Esempi in C# (CSharp) per CodepointScript

Linguaggio di programmazione: C# (CSharp)

Classe/tipologia: CodepointScript

Esempi su hotexamples.com: 3

CodepointScript in C# (CSharp): 3 esempi trovati. Questi sono i migliori esempi reali in C# (CSharp) per CodepointScript, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Esempio n. 1

Mostra file

File: UnicodeScriptDetectorNet.cs Progetto: DaniRK/UnicodeScriptDetectorNet

        /// <summary>
        /// Returns the fraction (0.0 to 1.0) of the Unicode code points in a string that can be written in the specified script.
        /// Common characters (e.g. digits, space) pass the test for any script, unless <paramref name="strict"/> is set to true.
        /// Null or empty strings always return 1 (the script name is not validated in that case).
        /// </summary>
        /// <remarks>
        /// The result is computed per code point, not per UTF-16 code unit: a surrogate pair counts as a single character.
        /// An unpaired surrogate is counted as one character that does not belong to any script.
        /// </remarks>
        /// <param name="testText">Text to evaluate</param>
        /// <param name="scriptName">Short or long script name (e.g. 'Latn', 'Latin'); compared case-insensitively</param>
        /// <param name="strict">If true, common characters are not counted as belonging to the script</param>
        /// <param name="applyExtendedProperties">If a common character has extended properties limiting it to a list of scripts and none of them matches <paramref name="scriptName"/>, then the common character does not match. Ignored if <paramref name="strict"/> is true.</param>
        /// <returns>Value between 0.0 and 1.0; 1.0 means every code point fits the script</returns>
        /// <exception cref="System.ArgumentNullException">Thrown when scriptName is null and testText is not empty</exception>
        /// <exception cref="System.ArgumentException">Thrown when an invalid scriptName is passed</exception>
        static public float ProbablyInScript(string testText, string scriptName, bool strict = false, bool applyExtendedProperties = true)
        {
            // The main difference to GetUsedScripts is the case of a text built only from common code points:
            // here we return 1.0 for ANY specified scriptName, while GetUsedScripts() returns an empty result set
            // (since no specific script can be detected).

            if (string.IsNullOrEmpty(testText))
            {
                return 1;
            }

            if (scriptName == null)
            {
                throw new ArgumentNullException(nameof(scriptName));
            }

            // Ordinal, case-insensitive comparison: script names are identifiers, so the lookup must not
            // depend on the current culture (culture-sensitive lower-casing breaks e.g. under tr-TR).
            Script sn = scripts.FirstOrDefault(n =>
                string.Equals(n.shortName, scriptName, StringComparison.OrdinalIgnoreCase) ||
                string.Equals(n.longName, scriptName, StringComparison.OrdinalIgnoreCase));

            if (sn == null)
            {
                throw new ArgumentException($"Invalid short or long scriptName supplied: '{scriptName}'", nameof(scriptName));
            }

            string shortName = sn.shortName;

            // for logic and technical details, see http://www.unicode.org/reports/tr24/

            int inScriptCount  = 0;
            int codePointCount = 0; // denominator: code points, not UTF-16 code units

            CodepointScript lastCps = null; // last script-carrying range seen, used to resolve Inherited characters

            for (int charIndex = 0; charIndex < testText.Length; charIndex++)
            {
                codePointCount++;

                // .NET strings are UTF-16. Code points > 0xffff are stored as two chars (a surrogate pair),
                // so we cannot simply use the numeric value of each char.
                int codePoint;
                if (char.IsSurrogatePair(testText, charIndex))
                {
                    codePoint = char.ConvertToUtf32(testText, charIndex);
                    charIndex++; // skip the low surrogate that belongs to this code point
                }
                else if (char.IsSurrogate(testText, charIndex))
                {
                    // Unpaired surrogate: not a valid code point, so it cannot belong to any script. Mismatch.
                    continue;
                }
                else
                {
                    codePoint = testText[charIndex];
                }

                var cps = codepointScripts.FirstOrDefault(cs => cs.rangeStart <= codePoint && cs.rangeEnd >= codePoint);

                if (cps == null) // not in table: implicitly Unknown and therefore not in script, this is a mismatch
                {
                    continue;
                }

                if (cps.script.type == ScriptType.Unknown) // explicitly Unknown, not in script, this is a mismatch
                {
                    continue;
                }

                if (cps.script.type == ScriptType.Common)
                {
                    // Most common code points can be used in any script, so this is a match unless strict is set.
                    // Some common code points have an extended scripts property that limits the set of scripts
                    // in which they may be used.

                    if (strict)
                    {
                        continue; // not a match
                    }

                    if (applyExtendedProperties)
                    {
                        var cpsExtended = codepointScriptsExtended.FirstOrDefault(cs => cs.rangeStart <= codePoint && cs.rangeEnd >= codePoint);

                        if (cpsExtended != null && !cpsExtended.scriptNamesShort.Contains(shortName))
                        {
                            continue; // not a match
                        }
                    }

                    inScriptCount++; // match; note that common characters do not update lastCps
                    continue;
                }

                if (cps.script.type == ScriptType.Inherited)
                {
                    // Inherit the script from the preceding script-carrying character.
                    if (lastCps == null)
                    {
                        // Inherited as first character should not happen in real written text; treat as a mismatch.
                        continue;
                    }

                    // Though there are a few inherited characters with extended properties, they are
                    // meaningless in the context of this method.
                    cps = lastCps;
                }

                if (cps.script.shortName == shortName)
                {
                    inScriptCount++;
                }

                lastCps = cps;
            }

            return (float)inScriptCount / codePointCount;
        }

Esempio n. 2

Mostra file

File: Program.cs Progetto: DaniRK/UnicodeScriptDetectorNet

        /// <summary>
        /// Loads the script tables from the Unicode data files in the "sourceData" directory:
        /// script names from PropertyValueAliases.txt into <c>scripts</c>, code point ranges from Scripts.txt
        /// into <c>codepointScripts</c>, and extended script ranges from ScriptExtensions.txt into
        /// <c>codepointScriptsExtended</c>. Adjacent ranges with identical data are merged into one range.
        /// </summary>
        /// <remarks>
        /// Entries are appended to the target collections; the collections are not cleared first.
        /// Malformed lines are not tolerated: parsing errors surface as exceptions.
        /// </remarks>
        static private void LoadFromText()
        {
            ////// script names

            var lines = File.ReadAllLines("sourceData/PropertyValueAliases.txt");
            var index = 0;

            foreach (var line in lines)
            {
                //sc ; Aghb                             ; Caucasian_Albanian
                if (!line.StartsWith("sc", StringComparison.Ordinal))
                {
                    continue;
                }

                string[] parts = line.Split(';');

                // Check the trimmed field again: other property names could start with 'sc' too.
                if (parts[0].Trim() != "sc")
                {
                    continue;
                }

                scripts.Add(new Script {
                    shortName = parts[1].Trim(), longName = parts[2].Trim(), tempIndex = index++
                });
            }

            ////// scripts

            lines = File.ReadAllLines("sourceData/Scripts.txt");
            CodepointScripts codepointScriptsRaw = new CodepointScripts();

            foreach (var line in lines)
            {
                if (line.Length == 0 || line.StartsWith('#'))
                {
                    continue;
                }

                //0061..007A; Latin # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
                //00AA; Latin # Lo       FEMININE ORDINAL INDICATOR

                string[] parts = line.Split(';');
                ParseRange(parts[0], out int rangeStart, out int rangeEnd);

                var scriptNameLong = parts[1].Split('#')[0].Trim();
                var script         = scripts.Where(sn => sn.longName == scriptNameLong).First(); // throws if the script name is unknown

                codepointScriptsRaw.Add(new CodepointScript {
                    rangeStart = rangeStart, rangeEnd = rangeEnd, script = script
                });
            }

            codepointScriptsRaw.Sort((item1, item2) => item1.rangeStart.CompareTo(item2.rangeStart));

            CodepointScript lastCps = null;

            foreach (var cps in codepointScriptsRaw)
            {
                /* Scripts.txt includes many ranges that could be expressed as part of a larger contiguous range.
                 * The smaller ranges exist because they carry extra data attributes in the trailing comment, which
                 * we do not use. Therefore, where possible, extend the previous range instead of adding a new one.
                 *
                 * example:
                 * 0020          ; Common # Zs       SPACE
                 * 0021..0023    ; Common # Po   [3] EXCLAMATION MARK..NUMBER SIGN
                 * 0024          ; Common # Sc       DOLLAR SIGN
                 * can be combined into a range 0020..0024, Common
                 */

                if (lastCps != null && cps.rangeStart == lastCps.rangeEnd + 1 && cps.script == lastCps.script)
                {
                    lastCps.rangeEnd = cps.rangeEnd;
                }
                else
                {
                    // Add a copy, so that extending the merged range never modifies the raw entries.
                    var newCps = new CodepointScript
                    {
                        rangeStart = cps.rangeStart,
                        rangeEnd   = cps.rangeEnd,
                        script     = cps.script
                    };

                    codepointScripts.Add(newCps);

                    lastCps = newCps;
                }
            }

            ///////// extended /////////////

            lines = File.ReadAllLines("sourceData/ScriptExtensions.txt");
            CodepointScriptsExtended codepointScriptsExtendedRaw = new CodepointScriptsExtended();

            foreach (var line in lines)
            {
                if (line.Length == 0 || line.StartsWith('#'))
                {
                    continue;
                }

                //102E0         ; Arab Copt # Mn       COPTIC EPACT THOUSANDS MARK
                //102E1..102FB  ; Arab Copt # No  [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED

                string[] parts = line.Split(';');
                ParseRange(parts[0], out int rangeStart, out int rangeEnd);

                var nameSection          = parts[1].Split('#')[0];
                var scriptNamesShort     = nameSection.Split(' ', StringSplitOptions.RemoveEmptyEntries);
                var scriptNamesShortList = new List <string>(scriptNamesShort);

                /* An extended range does not necessarily coincide with a full range in codepointScripts.
                 * Example:
                 * in codepointScripts:
                 *              rangeStart = 0x951, rangeEnd = 0x954, script "Zinh"
                 * in the extended data:
                 *              rangeStart = 0x952, rangeEnd = 0x952, script names { "Beng", "Deva", "Gran", ... }
                 *
                 * NOTE(review): the original comment here described folding the extended names into
                 * codepointScripts by splitting such ranges (0x951 / 0x952 / 0x953..0x954). This method does not
                 * do that; the extended ranges are kept in their own table. Confirm whether the split is still planned.
                 */

                codepointScriptsExtendedRaw.Add(new CodepointScriptExtended
                {
                    rangeStart = rangeStart, rangeEnd = rangeEnd, scriptNamesExtendedShort = scriptNamesShortList
                });
            }

            codepointScriptsExtendedRaw.Sort((item1, item2) => item1.rangeStart.CompareTo(item2.rangeStart));

            CodepointScriptExtended lastCpsExt = null;

            foreach (var cpsExt in codepointScriptsExtendedRaw)
            {
                /* ScriptExtensions.txt likewise lists adjacent ranges with the same script list separately.
                 * Where possible, extend the previous range instead of adding a new one.
                 *
                 * example:
                 * 102E0         ; Arab Copt # Mn       COPTIC EPACT THOUSANDS MARK
                 * 102E1..102FB  ; Arab Copt # No  [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
                 * can be combined into a range 102E0..102FB, Arab Copt
                 */

                // SequenceEqual is order-sensitive: ranges merge only if they list the same scripts in the same order.
                if (lastCpsExt != null && cpsExt.rangeStart == lastCpsExt.rangeEnd + 1 && lastCpsExt.scriptNamesExtendedShort.SequenceEqual(cpsExt.scriptNamesExtendedShort))
                {
                    lastCpsExt.rangeEnd = cpsExt.rangeEnd;
                }
                else
                {
                    var newCpsExt = new CodepointScriptExtended
                    {
                        rangeStart = cpsExt.rangeStart,
                        rangeEnd   = cpsExt.rangeEnd,
                        scriptNamesExtendedShort = cpsExt.scriptNamesExtendedShort
                    };

                    codepointScriptsExtended.Add(newCpsExt);

                    lastCpsExt = newCpsExt;
                }
            }

            // Parses the first field of a data line, either a single hex code point ("00AA")
            // or an inclusive hex range ("0061..007A"), into start and end values.
            void ParseRange(string field, out int start, out int end)
            {
                if (field.Contains('.'))
                {
                    string[] bounds = field.Split("..");
                    start = Convert.ToInt32(bounds[0].Trim(), 16);
                    end   = Convert.ToInt32(bounds[1].Trim(), 16);
                }
                else
                {
                    start = end = Convert.ToInt32(field.Trim(), 16);
                }
            }
        }

Esempio n. 3

Mostra file

File: UnicodeScriptDetectorNet.cs Progetto: DaniRK/UnicodeScriptDetectorNet

        /// <summary>
        /// Returns a list of possible "writing scripts" (like 'Latin', 'Arabic') that might have been used to write the specified text, together with a probability for each.
        /// Multiple scripts may be returned if a text either is composed of mixed scripts OR if only code points were used that belong
        /// to multiple scripts.
        /// An empty list is returned if the string is null or empty, or if no script at all could be detected (such as "123," which only contains 'common' code points).
        /// </summary>
        /// <remarks>
        /// Probabilities are fractions (0.0 to 1.0) of the characters that carry a script; unknown and common
        /// characters are left out of the total. The list is sorted with the highest probability first.
        /// Unpaired surrogates are skipped like unknown characters.
        /// </remarks>
        /// <param name="testText">Text to evaluate</param>
        /// <param name="ignoreInherited">If true: special characters that inherit their script from the preceding character are not counted</param>
        /// <returns>Result list</returns>
        static public Results GetUsedScripts(string testText, bool ignoreInherited = true /*, bool useExtendedProperties = false*/)
        {
            // for logic and technical details, see http://www.unicode.org/reports/tr24/

            // Only null/empty input short-circuits; a one-character text is evaluated like any other text.
            if (string.IsNullOrEmpty(testText))
            {
                return new Results();
            }

            int[] buckets = new int[scripts.Length]; // hit counter per script, indexed by Script.tempIndex
            int   totalRelevantCharacters = 0;

            CodepointScript lastCps = null; // last script-carrying range seen, used to resolve Inherited characters

            for (int charIndex = 0; charIndex < testText.Length; charIndex++)
            {
                // .NET strings are UTF-16. Code points > 0xffff are stored as two chars (a surrogate pair),
                // so we cannot simply use the 16 bit numeric value of each char.
                int codePoint;
                if (char.IsSurrogatePair(testText, charIndex))
                {
                    codePoint = char.ConvertToUtf32(testText, charIndex);
                    charIndex++; // skip the low surrogate that belongs to this code point
                }
                else if (char.IsSurrogate(testText, charIndex))
                {
                    // Unpaired surrogate: not a valid code point, treat it like an unknown character.
                    continue;
                }
                else
                {
                    codePoint = testText[charIndex];
                }

                var cps = codepointScripts.FirstOrDefault(cs => cs.rangeStart <= codePoint && cs.rangeEnd >= codePoint);

                if (cps == null) // not in table means implicitly Unknown
                {
                    continue;
                }

                if (cps.script.type == ScriptType.Unknown) // explicitly set to Unknown
                {
                    continue;
                }

                if (cps.script.type == ScriptType.Common) // common characters say nothing about the script used
                {
                    continue;
                }

                if (cps.script.type == ScriptType.Inherited)
                {
                    if (ignoreInherited)
                    {
                        continue;
                    }

                    if (lastCps == null) // should not happen in real written text
                    {
                        continue;
                    }

                    // Count the inherited character for the script of the preceding script-carrying character.
                    cps = lastCps;
                }

                lastCps = cps;
                totalRelevantCharacters++;

                buckets[cps.script.tempIndex]++;
            }

            Results results = new Results();

            for (int i = 0; i < buckets.Length; i++)
            {
                var bucket = buckets[i];
                if (bucket > 0)
                {
                    // totalRelevantCharacters is > 0 here, because at least this bucket was incremented.
                    float p          = (float)bucket / totalRelevantCharacters;
                    var   scriptName = scripts.First(sn => sn.tempIndex == i);

                    results.Add(new Result
                    {
                        scriptNameShort = scriptName.shortName,
                        scriptNameLong  = scriptName.longName,
                        probabilty      = p
                    });
                }
            }

            results.Sort((item1, item2) => item2.probabilty.CompareTo(item1.probabilty)); // reverse sort, highest probability first

            return results;
        }