/// <summary>
/// Verifies that calling the parameterless <c>SetStringTranslator</c>,
/// <c>SetRegxTranslator</c> and <c>SetCharTranslator</c> overloads after all three
/// translators were configured leaves <c>StringTransDict</c>, <c>RegxTuples</c>
/// and <c>CharTransDict</c> with a count of zero.
/// </summary>
public void TestAll()
{
    // Accented characters and their non-accented equivalents. The two strings are
    // paired by position, so they must be a 1-to-1 mapping of equal length.
    // Strings are used instead of lists because they are easier to maintain
    // for a large number of characters.
    string matchChar = "ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ¡¿";
    string replaceChar = "SOZsozYYuAAAAAAACEEEEIIIIDNOOOOOOUUUUYsaaaaaaaceeeeiiiionoooooouuuuyy"
                         + new string(' ', 2); // '¡' and '¿' each map to a space

    // Word dictionary; pass a comparer to ignore case (the default is Ordinal).
    StringComparer comparer = StringComparer.OrdinalIgnoreCase;
    Dictionary<string, string> wordDictionary = new Dictionary<string, string>(comparer)
    {
        { "chevrolet", "Ford" },
        { "mAzDa", "BMW" },
        { "and and", "and" } // will never match
    };

    // Need the `System.ValueTuple` package for this style of init on v4.6 and below.
    List<(string, string)> regxList = new List<(string, string)>
    {
        // Match, Replace
        ("BMW", "Fiat"),
        (@"\s+", " "),        // multi whitespace to 1 space
        (@"^\s*|\s*$", "")    // trims leading/ending spaces
    };

    // The text itself is irrelevant here; only the translator collections are checked.
    string input = "Randy Butternubs";
    Scrub st = new Scrub(input);

    // Configure all three translators...
    st.SetStringTranslator(wordDictionary, true); // case insensitive match
    st.SetCharTranslator(matchChar, replaceChar);
    st.SetRegxTranslator(regxList);

    // ...then reset each of them through its parameterless overload.
    st.SetStringTranslator();
    st.SetRegxTranslator();
    st.SetCharTranslator();

    Assert.AreEqual(0, st.CharTransDict.Count);
    Assert.AreEqual(0, st.StringTransDict.Count);
    Assert.AreEqual(0, st.RegxTuples.Count);
}
/// <summary>
/// Checks that a character map handed to the dictionary overload of
/// <c>SetCharTranslator</c> is what <c>CharTransDict</c> exposes afterwards.
/// </summary>
public void SetDict_CharTransDict_Matches()
{
    // Arrange: lower-case letters mapped to their upper-case forms.
    var charMap = new Dictionary<char, char>
    {
        ['a'] = 'A',
        ['b'] = 'B',
        ['c'] = 'C'
    };
    var scrubber = new Scrub(string.Empty);

    // Act
    scrubber.SetCharTranslator(charMap);

    // Assert
    CollectionAssert.AreEqual(charMap, scrubber.CharTransDict);
}
/// <summary>
/// Verifies that the two-string overload of <c>SetCharTranslator</c> fills
/// <c>CharTransDict</c> with one entry per character pair: the keys, read back in
/// enumeration order, must equal the match string and the values the replace string.
/// </summary>
public void SetString_CharTransDict_Matches()
{
    Scrub st = new Scrub("");

    // Every match character must be unique and the two strings must have the same
    // length, otherwise the count and round-trip assertions below cannot hold.
    string expectedMatchChar = "ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ¡¿";
    string expectedReplaceChar = "SOZsozYYuAAAAAAACEEEEIIIIDNOOOOOOUUUUYsaaaaaaaceeeeiiiionoooooouuuuyy"
                                 + new string(' ', 2); // '¡' and '¿' each map to a space

    st.SetCharTranslator(expectedMatchChar, expectedReplaceChar);

    // NOTE(review): rebuilding the strings from Keys/Values assumes CharTransDict
    // enumerates in insertion order - confirm against its declared type.
    var match = new string(st.CharTransDict.Keys.ToArray());
    var replace = new string(st.CharTransDict.Values.ToArray());

    Assert.AreEqual(expectedMatchChar.Length, st.CharTransDict.Count);
    Assert.AreEqual(expectedMatchChar, match);
    Assert.AreEqual(expectedReplaceChar, replace);
}
/// <summary>
/// End-to-end check of the chained scrubbing calls: strips commas, maps characters,
/// maps words, applies the regex list and strips "Mr.", then compares the result
/// with the expected sentence. Also exercises adding entries to each translator
/// collection after it has been set.
/// </summary>
public void TestAll()
{
    // Accented characters and their non-accented equivalents. The two strings are
    // paired by position, so they must be a 1-to-1 mapping of equal length.
    // Strings are used instead of lists because they are easier to maintain
    // for a large number of characters.
    string matchChar = "ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ¡¿";
    string replaceChar = "SOZsozYYuAAAAAAACEEEEIIIIDNOOOOOOUUUUYsaaaaaaaceeeeiiiionoooooouuuuyy"
                         + new string(' ', 2); // '¡' and '¿' each map to a space

    // Word dictionary; pass a comparer to ignore case (the default is Ordinal).
    StringComparer comparer = StringComparer.OrdinalIgnoreCase;
    Dictionary<string, string> wordDictionary = new Dictionary<string, string>(comparer)
    {
        { "chevrolet", "Ford" },
        { "mAzDa", "BMW" },
        { "and and", "and" }, // will never match
    };

    // Need the `System.ValueTuple` package for this style of init on v4.6 and below.
    List<(string, string)> regxList = new List<(string, string)>
    {
        // Match, Replace
        ("BMW", "Fiat"),
        (@"\s+", " "),        // multi whitespace to 1 space
        (@"^\s*|\s*$", "")    // trims leading/ending spaces
    };

    string sentence = "¿¡Señor, the Chevrolet guys don't like Dodge guys, and and no one like MaZdA, Ola Senor?! ";
    string expectedSentence = "Magoo the Ford guys don#t like Mercedes guys and and no one like Fiat Ola Magoo?!";

    Scrub st = new Scrub(sentence);

    // Set dictionary up, case insensitive match
    st.SetStringTranslator(wordDictionary, true);
    // Set up character translators
    st.SetCharTranslator(matchChar, replaceChar);
    // Set up list of regex replaces
    st.SetRegxTranslator(regxList);

    // Add a string translation after the fact
    st.StringTransDict.Add("dodge", "Mercedes");
    // Add a regex translation after the fact
    st.RegxTuples.Add(("Senor", "Mr.Magoo"));
    // Add a character translation after the fact
    st.CharTransDict.Add('\'', '#');

    // Run the whole pipeline; the final Strip removes the "Mr." that the
    // "Senor" -> "Mr.Magoo" regex replacement introduced.
    st.Strip("[,]").MapChars().MapWords().RegxTranslate().Strip(@"Mr\.");

    Assert.AreEqual(expectedSentence, st.ToString());
}
// Console demo of Scrub: character mapping, word mapping, ordered regex replacement,
// the predefined patterns used below ("Email", "NonAscii", "ScriptTags", "TagsSimple")
// and registering a custom predefined pattern. Each result is written to the console.
// ReSharper disable once UnusedParameter.Local
private static void Main(string[] args)
{
    // Map any character to any other character. The matchChar string MUST contain
    // only unique characters; replaceChar holds the translated character at the same
    // position. The example below maps accented characters to their non-accented
    // equivalents. Both strings must be a 1-to-1 mapping of equal length. Strings are
    // used to make it easier to deal with lots of characters.
    string matchChar = "ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ¡¿";
    string replaceChar = "SOZsozYYuAAAAAAACEEEEIIIIDNOOOOOOUUUUYsaaaaaaaceeeeiiiionoooooouuuuyy"
                         + new string(' ', 2); // '¡' and '¿' each map to a space

    // Set up a dictionary; to ignore case, construct it with a case-insensitive
    // comparer. These words are mapped to any instances of other words. Each word of
    // a sentence is passed to the dictionary for translation; current or past changes
    // are not candidates for any further changes (unlike the regex list).
    StringComparer comparer = StringComparer.OrdinalIgnoreCase; // default is just Ordinal
    Dictionary<string, string> wordDictionary = new Dictionary<string, string>(comparer)
    {
        { "chevrolet", "Ford" },
        { "mAzDa", "BMW" },
        { "and and", "and" } // will never match
    };

    // NOTE: Need the `System.ValueTuple` package for this style of init on v4.6 and below.
    // Each regex list item is executed in list order. The first element is the regex
    // match string (C# style), the second is the replacement used when the pattern
    // matches. A match can affect the entire string, and so can each subsequent match.
    List<(string, string)> regxList = new List<(string, string)>
    {
        // Match, Replace
        ("BMW", "Fiat"),      // swaps 'BMW' (case dependent) with 'Fiat'
        (@"\s+", " "),        // multi whitespace to 1 space
        (@"^\s*|\s*$", "")    // trims leading/ending spaces
    };

    // Test sentence with odd characters, spaces and other things needing scrubbing
    string sentence = "¿¡Señor, the Chevrolet guys don't like Dodge guys, and and no one like MaZdA, Ola Senor?! ";

    // Dump the original string
    Console.WriteLine("The Sentence : >{0}<", sentence);

    Scrub st = new Scrub(sentence);

    // Set dictionary up, case insensitive match
    st.SetStringTranslator(wordDictionary, true);
    // Set up character translators
    st.SetCharTranslator(matchChar, replaceChar);
    // Set up list of regex replaces
    st.SetRegxTranslator(regxList);

    // Add a string translation after the fact
    st.StringTransDict.Add("dodge", "Mercedes");
    // Add a regex translation after the fact
    st.RegxTuples.Add(("Senor", "Mr.Magoo"));
    // Add a character translation after the fact
    st.CharTransDict.Add('\'', '#');

    // Do all sorts of stuff!
    string translated = st.Strip("[,]").MapChars().MapWords().RegxTranslate().Strip(@"Mr\.").ToString();

    // Should be something like the string below -
    // Magoo the Ford guys don#t like Mercedes guys and and no one like Fiat Ola Magoo?!
    Console.WriteLine("Translated : >{0}<", translated);

    // Reset the string with some (example) e-mail addresses and mask them
    st.Set("alice@example.com is sending an email to bob@example.org");
    translated = st.RegxDefined("Email", "**Email Removed**").ToString();
    Console.WriteLine("Masked : >{0}<", translated);

    // Reset the string with mixed Japanese/English text and drop the non-ASCII part
    st.Set(" 前に来た時は北側からで、当時の光景はいまでも思い出せる。 Even now I remember the scene I saw approaching the city from the north. 青竜山脈から流れる川が湖へと流れこむ様、湖の中央には純白のホ");
    translated = st.RegxDefined("NonAscii", string.Empty).ToString();
    Console.WriteLine("To all ASCII : >{0}<", translated);

    // Reset the string with some markup; remove script blocks first, then remaining tags
    st.Set(@"<h1>Title</h1><script>var a=1; \\comment</script> Not In Script Tags");
    translated = st.RegxDefined("ScriptTags", string.Empty).RegxDefined("TagsSimple", string.Empty).ToString();
    Console.WriteLine("Strip Script and Tags : >{0}<", translated);

    // Reset, register a new predefined match pattern and make regex matching ignore case.
    // The '$' is escaped so it matches a literal dollar sign instead of acting as
    // the end-of-input anchor.
    st.Set("wtf does RemoveWTF do? Is WtF Case SeNsItIvE?");
    st.RegxMatchesDefined.Add("RemoveWTF", @"(wtf)|(what the)\s+(hell|\$hit)");
    translated = st.RegxIgnoreCase().RegxDefined("RemoveWTF", "XXX").ToString();
    Console.WriteLine("New Pre-defined Match : >{0}<", translated);
}