Exemple #1
0
        /// <summary>
        /// Loads a word dictionary, a character map and a regex list into a
        /// <c>Scrub</c> instance, then calls the parameterless
        /// <c>Set*Translator()</c> overloads and asserts that all three
        /// translator collections end up empty.
        /// </summary>
        public void TestAll()
        {
            // Accented characters and their non-accented equivalents. The two strings
            // are paired position by position, so they must have the same length and
            // matchChar must not repeat a character.

            string matchChar   = "ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ¡¿";
            string replaceChar = "SOZsozYYuAAAAAAACEEEEIIIIDNOOOOOOUUUUYsaaaaaaaceeeeiiiionoooooouuuuyy  ";

            // Word dictionary built with a case-insensitive comparer
            // (the default comparer would be plain Ordinal).

            StringComparer comparer = StringComparer.OrdinalIgnoreCase;
            Dictionary <string, string> wordDictionary = new Dictionary <string, string>(comparer)
            {
                { "chevrolet", "Ford" },
                { "mAzDa", "BMW" },
                { "and and", "and" }  // will never match
            };

            // Tuple-style initialisation needs the `System.ValueTuple` package
            // on .NET Framework 4.6 and below.

            List <(string, string)> regxList = new List <(string, string)>
            {                      // Match, Replace
                ("BMW", "Fiat"),
                (@"\s+", " "),     // multi whitespace to 1 space
                (@"^\s*|\s*$", "") // trims leading/ending spaces
            };

            string expect = "Randy Butternubs";
            Scrub  st     = new Scrub(expect);

            // Populate all three translators first ...

            st.SetStringTranslator(wordDictionary, true);
            st.SetCharTranslator(matchChar, replaceChar);
            st.SetRegxTranslator(regxList);

            // ... then call the parameterless overloads, which this test expects
            // to leave every translator collection empty.

            st.SetStringTranslator();
            st.SetRegxTranslator();
            st.SetCharTranslator();

            Assert.AreEqual(0, st.CharTransDict.Count);
            Assert.AreEqual(0, st.StringTransDict.Count);
            Assert.AreEqual(0, st.RegxTuples.Count);
        }
Exemple #2
0
        /// <summary>
        /// Passes a character map to <c>SetCharTranslator</c> as a dictionary and
        /// asserts that <c>CharTransDict</c> then holds exactly the same
        /// key/value pairs.
        /// </summary>
        public void SetDict_CharTransDict_Matches()
        {
            Scrub st = new Scrub("");

            var expectedCharMap = new Dictionary <char, char>()
            {
                { 'a', 'A' },
                { 'b', 'B' },
                { 'c', 'C' }
            };

            st.SetCharTranslator(expectedCharMap);

            // Dictionary enumeration order is not guaranteed, so compare the
            // contents without regard to order instead of element by element.
            CollectionAssert.AreEquivalent(expectedCharMap, st.CharTransDict);
        }
Exemple #3
0
        /// <summary>
        /// Passes two parallel strings to <c>SetCharTranslator</c> and asserts that
        /// <c>CharTransDict</c> maps each character of the match string to the
        /// character at the same position in the replacement string.
        /// </summary>
        public void SetString_CharTransDict_Matches()
        {
            Scrub st = new Scrub("");

            // Both strings are paired position by position: same length, and the
            // match string must not repeat a character.
            string expectedMatchChar   = "ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ¡¿";
            string expectedReplaceChar = "SOZsozYYuAAAAAAACEEEEIIIIDNOOOOOOUUUUYsaaaaaaaceeeeiiiionoooooouuuuyy  ";

            st.SetCharTranslator(expectedMatchChar, expectedReplaceChar);

            // One entry per match character, nothing extra.
            Assert.AreEqual(expectedMatchChar.Length, st.CharTransDict.Count);

            // Look every pair up by key so the check does not depend on the
            // dictionary's enumeration order.
            for (int i = 0; i < expectedMatchChar.Length; i++)
            {
                char key = expectedMatchChar[i];

                Assert.IsTrue(st.CharTransDict.TryGetValue(key, out char mapped), $"Missing mapping for '{key}'");
                Assert.AreEqual(expectedReplaceChar[i], mapped);
            }
        }
        /// <summary>
        /// Runs a sentence through a chain of strip, character-map, word-map and
        /// regex steps and asserts the resulting text.
        /// </summary>
        public void TestAll()
        {
            // Raw input and the text expected once the whole chain has run.
            const string rawText  = "¿¡Señor, the Chevrolet guys don't like     Dodge     guys, and and no one like MaZdA, Ola Senor?!    ";
            const string expected = "Magoo the Ford guys don#t like Mercedes guys and and no one like Fiat Ola Magoo?!";

            // Accented characters and their plain counterparts, paired by position
            // (two strings are easier to maintain than a long list of pairs).
            var accented = "ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ¡¿";
            var plain    = "SOZsozYYuAAAAAAACEEEEIIIIDNOOOOOOUUUUYsaaaaaaaceeeeiiiionoooooouuuuyy  ";

            // Word replacements, keyed case-insensitively.
            var wordMap = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
            {
                ["chevrolet"] = "Ford",
                ["mAzDa"]     = "BMW",
                ["and and"]   = "and", // noted by the author as never matching
            };

            // (pattern, replacement) pairs. The tuple syntax needs the
            // `System.ValueTuple` package on .NET Framework 4.6 and below.
            var patterns = new List<(string, string)>
            {
                ("BMW", "Fiat"),
                (@"\s+", " "),       // runs of whitespace become one space
                (@"^\s*|\s*$", ""),  // drop leading/trailing whitespace
            };

            var scrubber = new Scrub(rawText);

            scrubber.SetStringTranslator(wordMap, true);
            scrubber.SetCharTranslator(accented, plain);
            scrubber.SetRegxTranslator(patterns);

            // Extend each translator after the initial setup.
            scrubber.StringTransDict.Add("dodge", "Mercedes");
            scrubber.RegxTuples.Add(("Senor", "Mr.Magoo"));
            scrubber.CharTransDict.Add('\'', '#');

            // Run the full chain; the result is read back from the scrubber below.
            scrubber.Strip("[,]")
                    .MapChars()
                    .MapWords()
                    .RegxTranslate()
                    .Strip(@"Mr\.");

            Assert.AreEqual(expected, scrubber.ToString());
        }
Exemple #5
0
        // Console demo of the Scrub class: character mapping, word mapping,
        // regex translation and the predefined regex patterns. Each result is
        // written to standard output.
        // ReSharper disable once UnusedParameter.Local
        private static void Main(string[] args)
        {
            // Map any character to any other character. The matchChar string MUST
            // contain only unique characters. The replaceChar string holds the matching
            // translated character at the same position.

            // The example below maps accented characters to their non-accented equivalents.
            // Both strings must be a 1 to 1 mapping and the same length. Strings are used
            // to make it easier to deal with lots of characters.

            string matchChar   = "ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ¡¿";
            string replaceChar = "SOZsozYYuAAAAAAACEEEEIIIIDNOOOOOOUUUUYsaaaaaaaceeeeiiiionoooooouuuuyy  ";

            // Set up a dictionary; to ignore case, create the dictionary with a
            // case-insensitive comparer. These words are mapped to any instances of
            // other words. Basically each word from a sentence is passed to the
            // dictionary for translation. Current or past changes are not candidates
            // for any further changes.

            StringComparer comparer = StringComparer.OrdinalIgnoreCase; // default is just Ordinal
            Dictionary <string, string> wordDictionary = new Dictionary <string, string>(comparer)
            {
                { "chevrolet", "Ford" },
                { "mAzDa", "BMW" },
                { "and and", "and" }  // will never match
            };

            // NOTE : Need `System.ValueTuple` package to do this style of init on v4.6 and below.

            // Regex list: each item is executed in the order of the list.
            // First element is the regex match string (C# style) and the second
            // is the replacement string if the pattern matches. Matches can affect the entire
            // string, and each subsequent match can as well.

            List <(string, string)> regxList = new List <(string, string)>
            {                      // Match, Replace
                ("BMW", "Fiat"),   // swaps 'BMW' (case dependent) with 'Fiat'
                (@"\s+", " "),     // multi whitespace to 1 space
                (@"^\s*|\s*$", "") // trims leading/ending spaces
            };

            // Test sentence with odd characters, spaces and other things needing scrubbing

            string sentence = "¿¡Señor, the Chevrolet guys don't like     Dodge     guys, and and no one like MaZdA, Ola Senor?!    ";

            // Dump the original string

            Console.WriteLine("The Sentence : >{0}<", sentence);

            Scrub st = new Scrub(sentence);

            // Set dictionary up, case insensitive match

            st.SetStringTranslator(wordDictionary, true);

            // set up character translators

            st.SetCharTranslator(matchChar, replaceChar);

            // set up list of regex replaces

            st.SetRegxTranslator(regxList);

            // add a string translation after the fact

            st.StringTransDict.Add("dodge", "Mercedes");

            // add a regex translation after the fact

            st.RegxTuples.Add(("Senor", "Mr.Magoo"));

            // add a character translation after the fact

            st.CharTransDict.Add('\'', '#');

            // do all sorts of stuff!

            string translated = st.Strip("[,]").MapChars().MapWords().RegxTranslate().Strip(@"Mr\.").ToString();

            // Should be something like the string below -
            // Magoo the Ford guys don#t like Mercedes guys and and no one like Fiat Ola Magoo?!

            Console.WriteLine("Translated   : >{0}<", translated);

            // reset the string with some emails
            st.Set("john.doe@example.com is sending an email to jane.doe@example.com");

            translated = st.RegxDefined("Email", "**Email Removed**").ToString();

            Console.WriteLine("Masked   : >{0}<", translated);

            // reset the string with mixed Japanese and ASCII text
            st.Set(" 前に来た時は北側からで、当時の光景はいまでも思い出せる。 Even now I remember the scene I saw approaching the city from the north.  青竜山脈から流れる川が湖へと流れこむ様、湖の中央には純白のホ");
            translated = st.RegxDefined("NonAscii", string.Empty).ToString();

            Console.WriteLine("To all ASCII : >{0}<", translated);

            // reset the string with some HTML markup, including a script block
            st.Set(@"<h1>Title</h1><script>var a=1; \\comment</script> Not In Script Tags");

            translated = st.RegxDefined("ScriptTags", string.Empty).RegxDefined("TagsSimple", string.Empty).ToString();

            Console.WriteLine("Strip Script and Tags   : >{0}<", translated);

            // reset and set up a predefined match pattern and set regex case sensitivity.
            // The '$' is escaped so it matches a literal dollar sign; unescaped it is
            // the end-of-input anchor and the "$hit" alternative could never match.
            st.Set("wtf does RemoveWTF do? Is WtF Case SeNsItIvE?");
            st.RegxMatchesDefined.Add("RemoveWTF", @"(wtf)|(what the)\s+(hell|\$hit)");

            translated = st.RegxIgnoreCase().RegxDefined("RemoveWTF", "XXX").ToString();
            Console.WriteLine("New Pre-defined Match   : >{0}<", translated);
        }