/// <summary>
/// A 4-char input ("cccc") mapped down to a 2-char token ("cc") must still
/// report offsets into the original input: end offset 4, final offset 4.
/// </summary>
public virtual void Test4to2()
{
    CharFilter filtered = new MappingCharFilter(normMap, new StringReader("cccc"));
    TokenStream tokenizer = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(tokenizer,
        new string[] { "cc" },
        new int[] { 0 },
        new int[] { 4 },
        4);
}
/// <summary>
/// A single supplementary code point (U+1D122, two UTF-16 code units) mapped
/// to "fclef"; the end offset (2) counts the surrogate pair in the input.
/// </summary>
public virtual void TestNonBMPChar()
{
    string input = UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1);
    CharFilter filtered = new MappingCharFilter(normMap, new StringReader(input));
    TokenStream tokenizer = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(tokenizer,
        new string[] { "fclef" },
        new int[] { 0 },
        new int[] { 2 },
        2);
}
/// <summary>
/// A 5-char input ("empty") mapped to the empty string yields no tokens,
/// but the final offset still reflects the 5 consumed input chars.
/// </summary>
public virtual void Test5to0()
{
    CharFilter filtered = new MappingCharFilter(normMap, new StringReader("empty"));
    TokenStream tokenizer = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(tokenizer,
        new string[0],
        new int[] { },
        new int[] { },
        5);
}
/// <summary>
/// One full-width exclamation mark (U+FF01) expands to the multi-char token
/// "full-width-exclamation"; offsets stay within the 1-char input.
/// </summary>
public virtual void TestFullWidthChar()
{
    CharFilter filtered = new MappingCharFilter(normMap, new StringReader("\uff01"));
    TokenStream tokenizer = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(tokenizer,
        new string[] { "full-width-exclamation" },
        new int[] { 0 },
        new int[] { 1 },
        1);
}
/// <summary>
/// A 1-char input ("k") expanded to a 3-char token ("kkk"); start/end
/// offsets (0/1) and the final offset (1) point into the original input.
/// </summary>
public virtual void Test1to3()
{
    CharFilter filtered = new MappingCharFilter(normMap, new StringReader("k"));
    TokenStream tokenizer = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(tokenizer,
        new string[] { "kkk" },
        new int[] { 0 },
        new int[] { 1 },
        1);
}
/// <summary>
/// A 3-char input ("bbb") contracted to a 1-char token ("b"); the end
/// offset (3) and final offset (3) cover the full original input.
/// </summary>
public virtual void Test3to1()
{
    CharFilter filtered = new MappingCharFilter(normMap, new StringReader("bbb"));
    TokenStream tokenizer = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(tokenizer,
        new string[] { "b" },
        new int[] { 0 },
        new int[] { 3 },
        3);
}
/// <summary>
/// Mixed stream of expanding, contracting and identity mappings in one
/// whitespace-separated input; all offsets must map back into the original.
/// </summary>
public virtual void TestTokenStream()
{
    string testString = "h i j k ll cccc bbb aa";
    CharFilter filtered = new MappingCharFilter(normMap, new StringReader(testString));
    TokenStream tokenizer = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(tokenizer,
        new string[] { "i", "i", "jj", "kkk", "llll", "cc", "b", "a" },
        new int[] { 0, 2, 4, 6, 8, 11, 16, 20 },
        new int[] { 1, 3, 5, 7, 10, 15, 19, 22 },
        testString.Length);
}
/// <summary>
/// Two MappingCharFilters chained back-to-back: mappings apply twice
/// (e.g. "ll" expands to "llllllll"), and offsets still correct back
/// through both layers into the original input.
/// </summary>
public virtual void TestChained()
{
    string testString = "aaaa ll h";
    CharFilter filtered = new MappingCharFilter(normMap,
        new MappingCharFilter(normMap, new StringReader(testString)));
    TokenStream tokenizer = new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(tokenizer,
        new string[] { "a", "llllllll", "i" },
        new int[] { 0, 5, 8 },
        new int[] { 4, 7, 9 },
        testString.Length);
}
/// <summary>
/// Reads the filter to exhaustion, resets it, and verifies the same single
/// character can be read again from the start.
/// </summary>
public virtual void TestReaderReset()
{
    CharFilter filter = new MappingCharFilter(normMap, new StringReader("x"));
    char[] scratch = new char[10];

    // First pass: exactly one char, then EOF (-1).
    int read = filter.Read(scratch, 0, 10);
    assertEquals(1, read);
    assertEquals('x', scratch[0]);
    read = filter.Read(scratch, 0, 10);
    assertEquals(-1, read);

    // Rewind and read the same content again.
    filter.Reset();
    read = filter.Read(scratch, 0, 10);
    assertEquals(1, read);
    assertEquals('x', scratch[0]);
}
/// <summary>
/// After draining the filter (one char, then -1/EOF), Reset() must rewind
/// so the same content is readable again from the beginning.
/// </summary>
public virtual void TestReaderReset()
{
    CharFilter filter = new MappingCharFilter(normMap, new StringReader("x"));
    char[] chunk = new char[10];

    // Drain: one character followed by EOF.
    int n = filter.Read(chunk, 0, 10);
    assertEquals(1, n);
    assertEquals('x', chunk[0]);
    n = filter.Read(chunk, 0, 10);
    assertEquals(-1, n);

    // Rewind; the original character must come back.
    filter.Reset();
    n = filter.Read(chunk, 0, 10);
    assertEquals(1, n);
    assertEquals('x', chunk[0]);
}
/// <summary>
/// Offset-correction test: the char filter rewrites a character sequence in
/// the input, and PatternTokenizer start/end offsets must be corrected back
/// into the ORIGINAL input string (first with group=-1 split-on-delimiter,
/// then with group=0 match-extraction).
/// </summary>
public virtual void TestOffsetCorrection()
{
    const string INPUT = "Günther Günther is here";

    // create MappingCharFilter
    // NOTE(review): the two "ü" arguments are presumably decomposed vs.
    // precomposed forms of the same glyph — confirm the source encoding
    // survives editing round-trips.
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.Add("ü", "ü");
    NormalizeCharMap normMap = builder.Build();
    CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

    // create PatternTokenizer (group=-1: split on the delimiter pattern)
    TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther", "is", "here" },
        new int[] { 0, 13, 26, 29 },
        new int[] { 12, 25, 28, 33 },
        INPUT.Length);

    // Same input, group=0: emit the pattern matches themselves as tokens.
    charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
    stream = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther" },
        new int[] { 0, 13 },
        new int[] { 12, 25 },
        INPUT.Length);
}
/// <summary>
/// Builds the analysis reader chain for a field: the raw reader is wrapped
/// in a MockCharFilter (second argument 0 — presumably its remainder/seed;
/// confirm against MockCharFilter's docs) and then in a MappingCharFilter
/// driven by <c>map</c>.
/// </summary>
public override TextReader InitReader(string fieldName, TextReader reader)
{
    return new MappingCharFilter(map, new MockCharFilter(reader, 0));
}
/// <summary>
/// Randomized test: builds a random NormalizeCharMap, applies it to random
/// documents with a brute-force reference implementation (greedy longest
/// match, tracking output-to-input offsets), then checks MappingCharFilter
/// produces the same output text and the same corrected offsets while being
/// consumed through a random mix of single-char and buffered reads.
/// </summary>
public virtual void TestRandomMaps2()
{
    Random random = Random();
    int numIterations = AtLeast(3);
    for (int iter = 0; iter < numIterations; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST iter=" + iter);
        }

        // Restrict the alphabet so random keys actually occur in random docs:
        char endLetter = (char)TestUtil.NextInt(random, 'b', 'z');

        IDictionary<string, string> map = new Dictionary<string, string>();
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        int numMappings = AtLeast(5);
        if (VERBOSE)
        {
            Console.WriteLine(" mappings:");
        }
        while (map.Count < numMappings)
        {
            string key = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, 7);
            if (key.Length != 0 && !map.ContainsKey(key))
            {
                string value = TestUtil.RandomSimpleString(random);
                map[key] = value;
                builder.Add(key, value);
                if (VERBOSE)
                {
                    Console.WriteLine(" " + key + " -> " + value);
                }
            }
        }

        NormalizeCharMap charMap = builder.Build();

        if (VERBOSE)
        {
            Console.WriteLine(" test random documents...");
        }

        for (int iter2 = 0; iter2 < 100; iter2++)
        {
            string content = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, AtLeast(1000));
            if (VERBOSE)
            {
                Console.WriteLine(" content=" + content);
            }

            // Do stupid dog-slow mapping:

            // Output string:
            StringBuilder output = new StringBuilder();

            // Maps output offset to input offset:
            IList<int?> inputOffsets = new List<int?>();

            int cumDiff = 0;
            int charIdx = 0;
            while (charIdx < content.Length)
            {
                int matchLen = -1;
                string matchRepl = null;

                foreach (KeyValuePair<string, string> ent in map)
                {
                    string match = ent.Key;
                    if (charIdx + match.Length <= content.Length)
                    {
                        int limit = charIdx + match.Length;
                        bool matches = true;
                        for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++)
                        {
                            if (match[charIdx2 - charIdx] != content[charIdx2])
                            {
                                matches = false;
                                break;
                            }
                        }
                        if (matches)
                        {
                            string repl = ent.Value;
                            if (match.Length > matchLen)
                            {
                                // Greedy: longer match wins
                                matchLen = match.Length;
                                matchRepl = repl;
                            }
                        }
                    }
                }

                if (matchLen != -1)
                {
                    // We found a match here!
                    if (VERBOSE)
                    {
                        Console.WriteLine(" match=" + content.Substring(charIdx, matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
                    }
                    output.Append(matchRepl);
                    int minLen = Math.Min(matchLen, matchRepl.Length);

                    // Common part, directly maps back to input
                    // offset:
                    for (int outIdx = 0; outIdx < minLen; outIdx++)
                    {
                        inputOffsets.Add(output.Length - matchRepl.Length + outIdx + cumDiff);
                    }

                    cumDiff += matchLen - matchRepl.Length;
                    charIdx += matchLen;

                    if (matchRepl.Length < matchLen)
                    {
                        // Replacement string is shorter than matched
                        // input: nothing to do
                    }
                    else if (matchRepl.Length > matchLen)
                    {
                        // Replacement string is longer than matched
                        // input: for all the "extra" chars we map
                        // back to a single input offset:
                        for (int outIdx = matchLen; outIdx < matchRepl.Length; outIdx++)
                        {
                            inputOffsets.Add(output.Length + cumDiff - 1);
                        }
                    }
                    else
                    {
                        // Same length: no change to offset
                    }

                    Debug.Assert(inputOffsets.Count == output.Length, "inputOffsets.size()=" + inputOffsets.Count + " vs output.length()=" + output.Length);
                }
                else
                {
                    inputOffsets.Add(output.Length + cumDiff);
                    output.Append(content[charIdx]);
                    charIdx++;
                }
            }

            string expected = output.ToString();
            if (VERBOSE)
            {
                Console.Write(" expected:");
                for (int charIdx2 = 0; charIdx2 < expected.Length; charIdx2++)
                {
                    Console.Write(" " + expected[charIdx2] + "/" + inputOffsets[charIdx2]);
                }
                Console.WriteLine();
            }

            MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content));

            StringBuilder actualBuilder = new StringBuilder();
            IList<int?> actualInputOffsets = new List<int?>();

            // Now consume the actual mapFilter, somewhat randomly:
            while (true)
            {
                // BUGFIX: random.Next(0, 1) always returns 0 (the upper
                // bound is exclusive), so "== 1" was never true and the
                // single-char Read() path was dead code. Next(2) == 1 is
                // the correct port of Java's random.nextBoolean().
                if (random.Next(2) == 1)
                {
                    int ch = mapFilter.Read();
                    if (ch == -1)
                    {
                        break;
                    }
                    actualBuilder.Append((char)ch);
                }
                else
                {
                    char[] buffer = new char[TestUtil.NextInt(random, 1, 100)];
                    int off = buffer.Length == 1 ? 0 : random.Next(buffer.Length - 1);
                    int count = mapFilter.Read(buffer, off, buffer.Length - off);
                    if (count == -1)
                    {
                        break;
                    }
                    else
                    {
                        actualBuilder.Append(buffer, off, count);
                    }
                }

                if (random.Next(10) == 7)
                {
                    // Map offsets
                    while (actualInputOffsets.Count < actualBuilder.Length)
                    {
                        actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                    }
                }
            }

            // Finish mapping offsets
            while (actualInputOffsets.Count < actualBuilder.Length)
            {
                actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
            }

            string actual = actualBuilder.ToString();

            // Verify:
            assertEquals(expected, actual);
            assertEquals(inputOffsets, actualInputOffsets);
        }
    }
}
/// <summary>
/// "empty" (5 chars) maps to nothing: no tokens are produced, yet the
/// final offset is 5 because all input characters were consumed.
/// </summary>
public virtual void Test5to0()
{
    CharFilter charFilter = new MappingCharFilter(normMap, new StringReader("empty"));
    TokenStream stream = new MockTokenizer(charFilter, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(stream,
        new string[0],
        new int[] { },
        new int[] { },
        5);
}
/// <summary>
/// Supplementary code point U+1D122 (encoded as a surrogate pair, two
/// UTF-16 units) maps to "fclef"; offsets count input code units (end=2).
/// </summary>
public virtual void TestNonBMPChar()
{
    string surrogatePair = UnicodeUtil.NewString(new int[] { 0x1D122 }, 0, 1);
    CharFilter charFilter = new MappingCharFilter(normMap, new StringReader(surrogatePair));
    TokenStream stream = new MockTokenizer(charFilter, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(stream,
        new string[] { "fclef" },
        new int[] { 0 },
        new int[] { 2 },
        2);
}
/// <summary>
/// Maps Windows path separators ('\') to '/' before running the
/// PathHierarchyTokenizer; end offsets must still index into the
/// original backslash-delimited input.
/// </summary>
public virtual void TestNormalizeWinDelimToLinuxDelim()
{
    NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
    mapBuilder.Add("\\", "/");
    NormalizeCharMap normMap = mapBuilder.Build();

    string path = "c:\\a\\b\\c";
    Reader mapped = new MappingCharFilter(normMap, new StringReader(path));
    PathHierarchyTokenizer tokenizer = new PathHierarchyTokenizer(mapped);
    AssertTokenStreamContents(tokenizer,
        new string[] { "c:", "c:/a", "c:/a/b", "c:/a/b/c" },
        new int[] { 0, 0, 0, 0 },
        new int[] { 2, 4, 6, 8 },
        new int[] { 1, 0, 0, 0 },
        path.Length);
}
/// <summary>
/// U+FF01 (full-width '!') expands to the long token
/// "full-width-exclamation" while offsets stay within the single input char.
/// </summary>
public virtual void TestFullWidthChar()
{
    CharFilter charFilter = new MappingCharFilter(normMap, new StringReader("\uff01"));
    TokenStream stream = new MockTokenizer(charFilter, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(stream,
        new string[] { "full-width-exclamation" },
        new int[] { 0 },
        new int[] { 1 },
        1);
}
/// <summary>
/// One whitespace-separated input exercising identity, expanding and
/// contracting mappings together; every token's offsets must map back
/// into the unmapped original string.
/// </summary>
public virtual void TestTokenStream()
{
    string testString = "h i j k ll cccc bbb aa";
    CharFilter charFilter = new MappingCharFilter(normMap, new StringReader(testString));
    TokenStream stream = new MockTokenizer(charFilter, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(stream,
        new string[] { "i", "i", "jj", "kkk", "llll", "cc", "b", "a" },
        new int[] { 0, 2, 4, 6, 8, 11, 16, 20 },
        new int[] { 1, 3, 5, 7, 10, 15, 19, 22 },
        testString.Length);
}
/// <summary>
/// Stacks two MappingCharFilters so each mapping is applied twice; offset
/// correction must compose through both layers back to the original input.
/// </summary>
public virtual void TestChained()
{
    string testString = "aaaa ll h";
    CharFilter inner = new MappingCharFilter(normMap, new StringReader(testString));
    CharFilter outer = new MappingCharFilter(normMap, inner);
    TokenStream stream = new MockTokenizer(outer, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(stream,
        new string[] { "a", "llllllll", "i" },
        new int[] { 0, 5, 8 },
        new int[] { 4, 7, 9 },
        testString.Length);
}
/// <summary>
/// Randomized test: builds a random NormalizeCharMap, computes the expected
/// mapped output and output-to-input offsets with a brute-force greedy
/// reference loop, then verifies MappingCharFilter agrees on both the text
/// and the CorrectOffset() results under randomized read patterns.
/// </summary>
public virtual void TestRandomMaps2()
{
    Random random = Random();
    int numIterations = AtLeast(3);
    for (int iter = 0; iter < numIterations; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST iter=" + iter);
        }

        // Restrict the alphabet so random keys actually occur in random docs:
        char endLetter = (char)TestUtil.NextInt(random, 'b', 'z');

        IDictionary<string, string> map = new Dictionary<string, string>();
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        int numMappings = AtLeast(5);
        if (VERBOSE)
        {
            Console.WriteLine(" mappings:");
        }
        while (map.Count < numMappings)
        {
            string key = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, 7);
            if (key.Length != 0 && !map.ContainsKey(key))
            {
                string value = TestUtil.RandomSimpleString(random);
                map[key] = value;
                builder.Add(key, value);
                if (VERBOSE)
                {
                    Console.WriteLine(" " + key + " -> " + value);
                }
            }
        }

        NormalizeCharMap charMap = builder.Build();

        if (VERBOSE)
        {
            Console.WriteLine(" test random documents...");
        }

        for (int iter2 = 0; iter2 < 100; iter2++)
        {
            string content = TestUtil.RandomSimpleStringRange(random, 'a', endLetter, AtLeast(1000));
            if (VERBOSE)
            {
                Console.WriteLine(" content=" + content);
            }

            // Do stupid dog-slow mapping:

            // Output string:
            StringBuilder output = new StringBuilder();

            // Maps output offset to input offset:
            IList<int?> inputOffsets = new List<int?>();

            int cumDiff = 0;
            int charIdx = 0;
            while (charIdx < content.Length)
            {
                int matchLen = -1;
                string matchRepl = null;

                foreach (KeyValuePair<string, string> ent in map)
                {
                    string match = ent.Key;
                    if (charIdx + match.Length <= content.Length)
                    {
                        int limit = charIdx + match.Length;
                        bool matches = true;
                        for (int charIdx2 = charIdx; charIdx2 < limit; charIdx2++)
                        {
                            if (match[charIdx2 - charIdx] != content[charIdx2])
                            {
                                matches = false;
                                break;
                            }
                        }
                        if (matches)
                        {
                            string repl = ent.Value;
                            if (match.Length > matchLen)
                            {
                                // Greedy: longer match wins
                                matchLen = match.Length;
                                matchRepl = repl;
                            }
                        }
                    }
                }

                if (matchLen != -1)
                {
                    // We found a match here!
                    if (VERBOSE)
                    {
                        Console.WriteLine(" match=" + content.Substring(charIdx, matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
                    }
                    output.Append(matchRepl);
                    int minLen = Math.Min(matchLen, matchRepl.Length);

                    // Common part, directly maps back to input
                    // offset:
                    for (int outIdx = 0; outIdx < minLen; outIdx++)
                    {
                        inputOffsets.Add(output.Length - matchRepl.Length + outIdx + cumDiff);
                    }

                    cumDiff += matchLen - matchRepl.Length;
                    charIdx += matchLen;

                    if (matchRepl.Length < matchLen)
                    {
                        // Replacement string is shorter than matched
                        // input: nothing to do
                    }
                    else if (matchRepl.Length > matchLen)
                    {
                        // Replacement string is longer than matched
                        // input: for all the "extra" chars we map
                        // back to a single input offset:
                        for (int outIdx = matchLen; outIdx < matchRepl.Length; outIdx++)
                        {
                            inputOffsets.Add(output.Length + cumDiff - 1);
                        }
                    }
                    else
                    {
                        // Same length: no change to offset
                    }

                    Debug.Assert(inputOffsets.Count == output.Length, "inputOffsets.size()=" + inputOffsets.Count + " vs output.length()=" + output.Length);
                }
                else
                {
                    inputOffsets.Add(output.Length + cumDiff);
                    output.Append(content[charIdx]);
                    charIdx++;
                }
            }

            string expected = output.ToString();
            if (VERBOSE)
            {
                Console.Write(" expected:");
                for (int charIdx2 = 0; charIdx2 < expected.Length; charIdx2++)
                {
                    Console.Write(" " + expected[charIdx2] + "/" + inputOffsets[charIdx2]);
                }
                Console.WriteLine();
            }

            MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content));

            StringBuilder actualBuilder = new StringBuilder();
            IList<int?> actualInputOffsets = new List<int?>();

            // Now consume the actual mapFilter, somewhat randomly:
            while (true)
            {
                // BUGFIX: random.Next(0, 1) always returns 0 (the upper
                // bound is exclusive), so "== 1" was never true and the
                // single-char Read() path was dead code. Next(2) == 1 is
                // the correct port of Java's random.nextBoolean().
                if (random.Next(2) == 1)
                {
                    int ch = mapFilter.Read();
                    if (ch == -1)
                    {
                        break;
                    }
                    actualBuilder.Append((char)ch);
                }
                else
                {
                    char[] buffer = new char[TestUtil.NextInt(random, 1, 100)];
                    int off = buffer.Length == 1 ? 0 : random.Next(buffer.Length - 1);
                    int count = mapFilter.Read(buffer, off, buffer.Length - off);
                    if (count == -1)
                    {
                        break;
                    }
                    else
                    {
                        actualBuilder.Append(buffer, off, count);
                    }
                }

                if (random.Next(10) == 7)
                {
                    // Map offsets
                    while (actualInputOffsets.Count < actualBuilder.Length)
                    {
                        actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
                    }
                }
            }

            // Finish mapping offsets
            while (actualInputOffsets.Count < actualBuilder.Length)
            {
                actualInputOffsets.Add(mapFilter.CorrectOffset(actualInputOffsets.Count));
            }

            string actual = actualBuilder.ToString();

            // Verify:
            assertEquals(expected, actual);
            assertEquals(inputOffsets, actualInputOffsets);
        }
    }
}
/// <summary>
/// "bbb" contracts to the single-char token "b"; the end offset (3) spans
/// the whole original 3-char input, as does the final offset.
/// </summary>
public virtual void Test3to1()
{
    CharFilter charFilter = new MappingCharFilter(normMap, new StringReader("bbb"));
    TokenStream stream = new MockTokenizer(charFilter, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(stream,
        new string[] { "b" },
        new int[] { 0 },
        new int[] { 3 },
        3);
}
/// <summary>
/// "j" (1 char) expands to the 2-char token "jj"; start/end offsets (0/1)
/// and the final offset (1) still describe the original input.
/// </summary>
public virtual void Test1to2()
{
    CharFilter charFilter = new MappingCharFilter(normMap, new StringReader("j"));
    TokenStream stream = new MockTokenizer(charFilter, MockTokenizer.WHITESPACE, false);
    AssertTokenStreamContents(stream,
        new string[] { "jj" },
        new int[] { 0 },
        new int[] { 1 },
        1);
}