Exemplo n.º 1
0
        /// <summary>
        ///     Strips vishraam markers out of a string.
        /// </summary>
        /// <param name="text">The text whose vishraams should be removed.</param>
        /// <param name="vishraams">
        ///     Flags selecting which vishraams to strip. When omitted, heavy, medium and light are all stripped.
        /// </param>
        /// <returns>The text with every selected vishraam marker removed.</returns>
        public static string StripVishraams(string text,
                                            Vishraam vishraams = Vishraam.Heavy | Vishraam.Medium | Vishraam.Light)
        {
            // Lookup keyed by the flag's name; the value is rendered with ToString() to get the marker text.
            var markerByName = Mapping.VishraamMapping();

            // Walk every declared member of the enum and keep the markers of those set in the argument.
            var selectedMarkers =
                from Enum flag in Enum.GetValues(vishraams.GetType())
                where vishraams.HasFlag(flag)
                select markerByName[flag.ToString()].ToString();

            // Pattern built by the project helper from the selected markers
            // (presumably a regex character class - confirm against StringRegexHelper).
            var markerPattern = StringRegexHelper.GetRegexClass(selectedMarkers);

            return Regex.Replace(text, markerPattern, "");
        }
Exemplo n.º 2
0
        /// <summary>
        ///    Removes line endings from a Gurmukhi or translation string.
        ///    Both Unicode and ASCII input are accepted.
        ///    Handy for deriving accurate first letters, or for tidying non-Gurbani text for display.
        ///    *Not* intended for headings or Sirlekhs.
        /// </summary>
        /// <param name="text">The text to strip endings from.</param>
        /// <returns>The text without its endings, trimmed of leading and trailing whitespace.</returns>
        public static string StripEndings(string text)
        {
            // Ending characters as they appear in Unicode, ASCII, and English text
            var terminators = StringRegexHelper.GetRegexClass(new[] { "।", "॥", "]", "[", "|" });
            // Translation endings sometimes open with this character ahead of the number
            var numberOpeners = StringRegexHelper.GetRegexClass(new[] { "(" });
            // Leftover fragment of a broken ending
            var emptyBrackets = StringRegexHelper.GetRegexGroup(new[] { "()" });

            // The digits 0-9 as plain strings
            var asciiDigits = Enumerable.Range(0, 10)
                              .Select(digit => digit.ToString())
                              .ToArray();
            // The same digits run through ToUnicodeGurmukhi (left lazy on purpose)
            var gurmukhiDigits = asciiDigits.Select(ToUnicodeGurmukhi);
            // ...and those results run through ToHindi
            var hindiDigits = gurmukhiDigits.Select(ToHindi);

            // One pattern fragment covering every digit variant above
            var digits = StringRegexHelper.GetRegexClass(
                asciiDigits
                .Concat(gurmukhiDigits)
                .Concat(hindiDigits)
                );

            // Rahao as written in Unicode, ASCII, and English
            var pauseWords = StringRegexHelper
                             .GetRegexGroup(new[] { "ਰਹਾਉ", ToAsciiGurmukhi("ਰਹਾਉ"), "Pause" });

            // Patterns are applied one after another, in this order
            var patterns = new[]
            {
                // An ending followed by a pause word => drop everything up to the end of the line
                $" ?{terminators} ?{pauseWords}.*",
                // An ending (or opener) followed by a digit => drop everything up to the end of the line
                $" ?({terminators}|{numberOpeners}){digits}.*",
                // A trailing run of digits, periods, and spaces that starts with a digit
                $" ?{digits}({digits}|[. ])*$",
                // Whatever ending characters are still left over
                $" ?{emptyBrackets}",
                $" ?{terminators}",
            };

            var stripped = text;

            foreach (var pattern in patterns)
            {
                // Trim after each pass so the next pattern sees no stray whitespace at either end
                stripped = Regex.Replace(stripped, pattern, "").Trim();
            }

            return stripped;
        }
        /// <summary>
        /// Renders text as syllable weights following Sanskrit prosody (Pingala, Matra/Meter/Morae).
        /// </summary>
        /// <param name="text">The string to convert.</param>
        /// <returns>A syllabic representation made of 1's (laghu/light/short) and 2's (guru/heavy/long).</returns>
        public static string ToSyllabicSymbols(string text)
        {
            // Markers used while scoring the text:
            //   a base character on its own is one syllable (light);
            //   a base character followed by any number of long (deeragh) sounds is two syllables (heavy).
            const string light         = "1";
            const string deeragh       = "s";
            const string heavySequence = "1s";
            const string heavy         = "2";

            // Signs that carry no weight; they are removed before anything is scored.
            // Order is preserved because it becomes the order of alternatives in the pattern.
            var zeroWeightSigns = new[]
            {
                "੍ਰ", "ੵ", "ੑ", "੍ਵ", "੍ਹ",
                "੍ਟ", "੍ਨ", "੍ਯ", "੍ਚ", "੍ਤ",
                "੍", "ਿ", "ੁ", "ਂ", "ਃ",
                "।", "॥", "☬", "਼", "❁",
            };

            // Base characters, each scored as light
            var baseCharacters = new[]
            {
                "ਇ", "ਉ", "ਙ", "ੳ", "ਅ", "ਬ", "ਭ", "ਚ",
                "ਛ", "ਦ", "ਧ", "ੲ", "ਡ", "ਢ", "ਗ", "ਘ",
                "ਹ", "ਜ", "ਝ", "ਕ", "ਖ", "ਲ", "ਲ਼", "ਮ",
                "ਨ", "ਪ", "ਫ", "ਤ", "ਥ", "ਰ", "ਸ", "ਸ਼",
                "ਟ", "ਠ", "ਵ", "ੜ", "ਣ", "ਯ", "ਜ਼", "ਗ਼",
                "ਖ਼", "ਫ਼", "ਞ",
            };

            // Long-sound modifiers, each scored as a deeragh marker
            var deeraghModifiers = new[]
            {
                "ੰ", "੍ਹੂ", "ੀ", "ੂ", "ੇ", "ੈ",
                "ੋ", "ੌ", "ਾਂ", "ਾ", "ੱ",
            };

            // Single symbols that already stand for a base character plus a long sound.
            // They are treated as unsplittable. Because it is not known whether further deeragh
            // modifiers may follow them, they are scored as a heavy sequence (base + deeragh)
            // rather than directly as heavy; if no further modifier can follow, mapping them
            // straight to the heavy symbol would be safe.
            var longVowels = new[] { "ਊ", "ਓ", "ਈ", "ਏ", "ਐ", "ਆ", "ਔ" };

            // Character => marker lookup.
            // NOTE(review): the lookup further down is done one UTF-16 char at a time, so any key longer
            // than one char can never be hit directly; such input is only handled through its remaining
            // single chars once the zero-weight signs have been stripped.
            var symbolByCharacter = baseCharacters
                                    .Select(c => new { Character = c, Symbol = light })
                                    .Concat(deeraghModifiers.Select(c => new { Character = c, Symbol = deeragh }))
                                    .Concat(longVowels.Select(c => new { Character = c, Symbol = heavySequence }))
                                    // Entries that do not follow the patterns above are mapped "as is"
                                    .Append(new { Character = "ੴ", Symbol = "21 2221" }) // Ik Oankaar / ਇੱਕ ਓਅੰਕਾਰ
                                    .Append(new { Character = " ", Symbol = " " })       // Keep the gaps between words
                                    .ToDictionary(entry => entry.Character, entry => entry.Symbol);

            var weightlessSigns = new Regex(StringRegexHelper.GetRegexGroup(zeroWeightSigns));

            var unicodeText = ToUnicodeGurmukhi(text);

            // Drop the weightless signs, then translate char by char; anything unmapped becomes a space
            var symbols = string.Concat(
                weightlessSigns
                .Replace(unicodeText, "")
                .Select(character =>
                        symbolByCharacter.TryGetValue(character.ToString(), out var symbol) ? symbol : " "));

            // Collapse runs of spaces into one
            symbols = Regex.Replace(symbols, " +", " ");
            // Several long sounds in a row count as a single one
            symbols = Regex.Replace(symbols, "s+", deeragh);
            // A light syllable followed by a long sound is one heavy syllable
            return Regex.Replace(symbols, "1s", heavy);
        }