/// <summary>
/// Returns the fraction (0.0 to 1.0) of code points in a string that can be written in the specified script.
/// Common characters (e.g. digits, space) pass the test for any script, unless strict is set to true.
/// Null or empty strings always return 1.
/// The fraction is computed over Unicode code points, so a surrogate pair counts as one character.
/// </summary>
/// <param name="testText">Text to evaluate</param>
/// <param name="scriptName">Short or long script name (e.g. 'Latn', 'Latin'); compared case-insensitively</param>
/// <param name="strict">Common characters are not counted as belonging to the script</param>
/// <param name="applyExtendedProperties">If a common character has extended properties limiting it to a list of scripts
/// and none of them matches the scriptName parameter, then the common character does not match.
/// Ignored if strict is true</param>
/// <returns>Value between 0.0 and 1.0, 1.0 for full fit</returns>
/// <exception cref="System.ArgumentException">Thrown when an invalid (or null) scriptName is passed for a non-empty text</exception>
static public float ProbablyInScript(string testText, string scriptName, bool strict = false, bool applyExtendedProperties = true)
{
    // The main difference to GetUsedScripts is the case of a text built only from common code points:
    // here we return 1.0 for ANY specified scriptName, while GetUsedScripts() returns an empty result set
    // (since no specific script can be detected).
    if (string.IsNullOrEmpty(testText))
    {
        return 1;
    }

    // Ordinal, case-insensitive match: script names are identifiers, not linguistic text, so the
    // comparison must not depend on the current culture. A null scriptName simply matches nothing.
    Script sn = scripts.FirstOrDefault(n =>
        string.Equals(n.shortName, scriptName, StringComparison.OrdinalIgnoreCase) ||
        string.Equals(n.longName, scriptName, StringComparison.OrdinalIgnoreCase));
    if (sn == null)
    {
        throw new ArgumentException($"Invalid short or long scriptName supplied: '{scriptName}'", nameof(scriptName));
    }

    string shortName = sn.shortName;

    // For logic and technical details, see http://www.unicode.org/reports/tr24/
    int inScriptCount = 0;
    int codePointCount = 0;           // denominator: number of code points, not UTF-16 code units
    CodepointScript lastCps = null;   // last non-common code point's entry, used for inheritance

    for (int charIndex = 0; charIndex < testText.Length; charIndex++)
    {
        // .NET strings are UTF-16: code points > 0xffff are stored as two chars (a surrogate pair),
        // so we cannot simply loop over chars and use their numeric value.
        int codePoint = char.ConvertToUtf32(testText, charIndex);
        if (codePoint > 0xffff)
        {
            charIndex++; // skip the low surrogate that belongs to this code point
        }

        codePointCount++;

        var cps = codepointScripts.FirstOrDefault(cs => cs.rangeStart <= codePoint && cs.rangeEnd >= codePoint);
        if (cps == null)
        {
            // Not in table: implicitly Unknown and therefore not in script, this is a mismatch.
            continue;
        }

        if (cps.script.type == ScriptType.Unknown)
        {
            // Explicitly Unknown: not in script, this is a mismatch.
            continue;
        }

        if (cps.script.type == ScriptType.Common)
        {
            // Most common code points can be used in any script, so this is a match unless strict is set.
            // Some common code points have a script-extensions property that limits the set of scripts
            // in which they may be used.
            if (strict)
            {
                continue; // not a match
            }

            if (applyExtendedProperties)
            {
                var cpsExtended = codepointScriptsExtended.FirstOrDefault(cs => cs.rangeStart <= codePoint && cs.rangeEnd >= codePoint);
                // NOTE(review): LoadFromText fills 'scriptNamesExtendedShort'; confirm that
                // 'scriptNamesShort' is the intended member here.
                if (cpsExtended != null && !cpsExtended.scriptNamesShort.Contains(shortName))
                {
                    continue; // not a match
                }
            }

            inScriptCount++; // match
            continue;
        }

        if (cps.script.type == ScriptType.Inherited)
        {
            // Inherit the script from the preceding (non-common) character.
            if (lastCps == null)
            {
                // Inherited as first character should not happen in real written text; treat as a mismatch.
                continue;
            }

            // Though there are a few inherited characters with extended properties, these are
            // meaningless in the context of this method.
            cps = lastCps;
        }

        if (cps.script.shortName == shortName)
        {
            inScriptCount++;
        }

        lastCps = cps;
    }

    // codePointCount is at least 1 here because testText is non-empty.
    return (float)inScriptCount / codePointCount;
}
/// <summary>
/// Fills scripts, codepointScripts and codepointScriptsExtended from the Unicode data files
/// "sourceData/PropertyValueAliases.txt", "sourceData/Scripts.txt" and "sourceData/ScriptExtensions.txt".
/// Adjacent ranges carrying the same script (or the same extension list) are merged into one range.
/// </summary>
static private void LoadFromText()
{
    ////// script names
    var index = 0;
    foreach (var line in File.ReadAllLines("sourceData/PropertyValueAliases.txt"))
    {
        // Example line:
        // sc ; Aghb ; Caucasian_Albanian
        if (!line.StartsWith("sc", StringComparison.Ordinal))
        {
            continue;
        }

        string[] parts = line.Split(';');
        if (parts[0].Trim() != "sc")
        {
            // Check again: other property names could start with 'sc' and whitespace may differ.
            continue;
        }

        scripts.Add(new Script { shortName = parts[1].Trim(), longName = parts[2].Trim(), tempIndex = index++ });
    }

    ////// scripts
    CodepointScripts codepointScriptsRaw = new CodepointScripts();
    foreach (var line in File.ReadAllLines("sourceData/Scripts.txt"))
    {
        if (string.IsNullOrWhiteSpace(line) || line.StartsWith('#'))
        {
            continue;
        }

        // Example lines:
        // 0061..007A; Latin # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
        // 00AA; Latin # Lo FEMININE ORDINAL INDICATOR
        string[] parts = line.Split(';');
        ParseCodepointRange(parts[0], out int rangeStart, out int rangeEnd);

        var scriptNameLong = parts[1].Split('#')[0].Trim();
        // First() throws if Scripts.txt names a script that PropertyValueAliases.txt did not define.
        var script = scripts.Where(sn => sn.longName == scriptNameLong).First();

        codepointScriptsRaw.Add(new CodepointScript { rangeStart = rangeStart, rangeEnd = rangeEnd, script = script });
    }

    codepointScriptsRaw.Sort((item1, item2) => item1.rangeStart.CompareTo(item2.rangeStart));

    /* Scripts.txt includes many ranges that could be expressed as part of a larger contiguous range.
     * The reason is that the smaller ranges also carry data attributes in the trailing comment, which
     * we do not use. Therefore, where possible, extend the previous range instead of adding a new one.
     *
     * Example:
     *   0020       ; Common # Zs     SPACE
     *   0021..0023 ; Common # Po [3] EXCLAMATION MARK..NUMBER SIGN
     *   0024       ; Common # Sc     DOLLAR SIGN
     * can be combined into a single range 0020..0024, Common.
     */
    CodepointScript lastCps = null;
    foreach (var cps in codepointScriptsRaw)
    {
        if (lastCps != null && cps.rangeStart == lastCps.rangeEnd + 1 && cps.script == lastCps.script)
        {
            lastCps.rangeEnd = cps.rangeEnd;
        }
        else
        {
            // Add a copy so that later range extensions do not modify the raw parsed entries.
            var newCps = new CodepointScript { rangeStart = cps.rangeStart, rangeEnd = cps.rangeEnd, script = cps.script };
            codepointScripts.Add(newCps);
            lastCps = newCps;
        }
    }

    ///////// extended /////////////
    CodepointScriptsExtended codepointScriptsExtendedRaw = new CodepointScriptsExtended();
    foreach (var line in File.ReadAllLines("sourceData/ScriptExtensions.txt"))
    {
        if (string.IsNullOrWhiteSpace(line) || line.StartsWith('#'))
        {
            continue;
        }

        // Example lines:
        // 102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
        // 102E1..102FB ; Arab Copt # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
        string[] parts = line.Split(';');
        ParseCodepointRange(parts[0], out int rangeStart, out int rangeEnd);

        // The field before '#' is a space separated list of short script names.
        var nameSection = parts[1].Split('#')[0];
        var scriptNamesShort = nameSection.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        var scriptNamesShortList = new List<string>(scriptNamesShort);

        // The extension ranges are kept in their own list. They are NOT merged into codepointScripts,
        // so no splitting of codepointScripts ranges (e.g. 0951..0954 Zinh around the extension entry
        // for 0952) is performed here.
        codepointScriptsExtendedRaw.Add(new CodepointScriptExtended { rangeStart = rangeStart, rangeEnd = rangeEnd, scriptNamesExtendedShort = scriptNamesShortList });
    }

    codepointScriptsExtendedRaw.Sort((item1, item2) => item1.rangeStart.CompareTo(item2.rangeStart));

    /* ScriptExtensions.txt likewise lists adjacent ranges that share the same list of scripts.
     * Extend the previous range instead of adding a new one where possible.
     *
     * Example:
     *   102E0        ; Arab Copt # Mn      COPTIC EPACT THOUSANDS MARK
     *   102E1..102FB ; Arab Copt # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
     * can be combined into a single range 102E0..102FB, Arab Copt.
     */
    CodepointScriptExtended lastCpsExt = null;
    foreach (var cpsExt in codepointScriptsExtendedRaw)
    {
        if (lastCpsExt != null
            && cpsExt.rangeStart == lastCpsExt.rangeEnd + 1
            && lastCpsExt.scriptNamesExtendedShort.SequenceEqual(cpsExt.scriptNamesExtendedShort))
        {
            lastCpsExt.rangeEnd = cpsExt.rangeEnd;
        }
        else
        {
            var newCpsExt = new CodepointScriptExtended { rangeStart = cpsExt.rangeStart, rangeEnd = cpsExt.rangeEnd, scriptNamesExtendedShort = cpsExt.scriptNamesExtendedShort };
            codepointScriptsExtended.Add(newCpsExt);
            lastCpsExt = newCpsExt;
        }
    }
}

/// <summary>
/// Parses the code point field of a Unicode data file line: either a single hexadecimal code point
/// ("00AA") or an inclusive hexadecimal range ("0061..007A"). Surrounding whitespace is ignored.
/// </summary>
/// <param name="rangeText">Text of the first ';'-separated field of the line</param>
/// <param name="rangeStart">First code point of the range</param>
/// <param name="rangeEnd">Last code point of the range (equal to rangeStart for a single code point)</param>
static private void ParseCodepointRange(string rangeText, out int rangeStart, out int rangeEnd)
{
    if (rangeText.Contains('.'))
    {
        string[] bounds = rangeText.Split("..");
        rangeStart = Convert.ToInt32(bounds[0].Trim(), 16);
        rangeEnd = Convert.ToInt32(bounds[1].Trim(), 16);
    }
    else
    {
        rangeStart = rangeEnd = Convert.ToInt32(rangeText.Trim(), 16);
    }
}
/// <summary>
/// Returns a list of possible "writing scripts" (like 'Latin', 'Arabic') that might have been used to write the
/// specified text, together with a probability for each.
/// Multiple scripts may be returned if a text either is composed of mixed scripts OR if only code points were used
/// that belong to multiple scripts.
/// An empty list is returned if the string is null or empty, or if no script at all could be detected
/// (such as "123," which only contains 'common' code points).
/// </summary>
/// <param name="testText">Text to evaluate</param>
/// <param name="ignoreInherited">If true: special characters that inherit their script from the preceding character are not counted</param>
/// <returns>Result list, sorted by descending probability</returns>
static public Results GetUsedScripts(string testText, bool ignoreInherited = true /*, bool useExtendedProperties = false*/)
{
    // For logic and technical details, see http://www.unicode.org/reports/tr24/
    if (string.IsNullOrEmpty(testText))
    {
        return new Results();
    }

    int[] buckets = new int[scripts.Length];   // hit count per script, indexed by Script.tempIndex
    int totalRelevantCharacters = 0;           // code points that were attributed to a specific script
    CodepointScript lastCps = null;            // last counted entry, used for inheritance

    for (int charIndex = 0; charIndex < testText.Length; charIndex++)
    {
        // .NET strings are UTF-16: code points > 0xffff are stored as two chars (a surrogate pair),
        // so we cannot simply loop over chars and use their 16 bit numeric value.
        int codePoint = char.ConvertToUtf32(testText, charIndex);
        if (codePoint > 0xffff)
        {
            charIndex++; // skip the low surrogate that belongs to this code point
        }

        var cps = codepointScripts.FirstOrDefault(cs => cs.rangeStart <= codePoint && cs.rangeEnd >= codePoint);
        if (cps == null)
        {
            // Not in table means implicitly Unknown.
            continue;
        }

        if (cps.script.type == ScriptType.Unknown || cps.script.type == ScriptType.Common)
        {
            // Explicitly Unknown, or Common: gives no hint about a specific script.
            continue;
        }

        if (cps.script.type == ScriptType.Inherited)
        {
            if (ignoreInherited)
            {
                continue;
            }

            if (lastCps == null)
            {
                // Inherited without a preceding character should not happen in real written text.
                continue;
            }

            cps = lastCps;
        }

        lastCps = cps;
        totalRelevantCharacters++;
        buckets[cps.script.tempIndex]++;
    }

    Results results = new Results();
    for (int i = 0; i < buckets.Length; i++)
    {
        int bucket = buckets[i];
        if (bucket == 0)
        {
            continue;
        }

        // bucket > 0 implies totalRelevantCharacters > 0, so the division is safe.
        float p = (float)bucket / totalRelevantCharacters;
        var script = scripts.Where(sn => sn.tempIndex == i).First();
        results.Add(new Result { scriptNameShort = script.shortName, scriptNameLong = script.longName, probabilty = p });
    }

    // Reverse sort: highest probability first.
    results.Sort((item1, item2) => item2.probabilty.CompareTo(item1.probabilty));
    return results;
}