// (currently disabled)
// <summary>
// Is character a Variation Selector? [https://en.wikipedia.org/wiki/Variation_Selectors_(Unicode_block)]
// Note that these are included in IsCombiner as well.
// </summary>
//public static bool IsVariationSelector(this char c) => c.IsInRange(0xfe00, 0xfe0f);

/// <summary>
/// Tells whether a character is to be treated as a numeral.
/// Besides characters whose Unicode major designation is 'N', this also accepts the
/// characters listed in ScriptHelpers.ChineseDigits, i.e. numeral characters that
/// Unicode itself does not classify as such (Chinese numbers).
/// This is meant for FactoredSegmenter, which uses this to prevent numeral characters
/// from being merged in SentencePiece.
/// </summary>
/// <param name="c">The character to classify.</param>
/// <returns>
/// true if <paramref name="c"/> is contained in ScriptHelpers.ChineseDigits or has
/// the Unicode major designation 'N'; false otherwise.
/// </returns>
public static bool IsNumeral(this char c)
{
    // @BUGBUG: currently known failures:
    //  - Arabic fractions: ٠٫٢٥

    // Chinese numeral letters are not classified as digits in Unicode, so they are
    // looked up first; the designation lookup only runs for all other characters.
    return ScriptHelpers.ChineseDigits.Contains(c)
        || Unicode.GetUnicodeMajorDesignation(c) == 'N';
}
// (currently disabled)
// <summary>
// Get script designators for each character in a line.
// This function handles surrogate pairs and combining marks. --@TODO: ...not yet, actually
// The function can optionally operate on a substring.
// </summary>
//public static Unicode.Script[] GetScripts(string line, int startIndex = 0, int length = int.MaxValue)
//{
//    if (length == int.MaxValue)
//        length = line.Length;
//    var scripts = new Unicode.Script[length];
//    for (var i = 0; i < length; i++)
//    {
//        // @TODO: Handle surrogates
//        char c = line[startIndex + i];
//        if (c.IsCombiner() && i > 0)
//            scripts[i] = scripts[i - 1];
//        else
//            scripts[i] = Unicode.GetScript(c);
//    }
//    return scripts;
//}

/// <summary>
/// Simplistic word-boundary detector.
/// This function attempts to detect word boundaries that can be detected in a language-independent
/// fashion from the surface form, and without additional knowledge sources.
/// I.e. it looks for a change in script and some changes in Unicode character designation.
/// This does not detect word breaks in continuous scripts, which require additional knowledge sources.
/// This function handles these special cases:
///  - some known allowed punctuation between characters, such as ' in words and . in numbers
///    @TODO: This rule may not apply to all scripts.
///  - surrogate pairs --@TODO
///  - numeral characters as determined by IsNumeral (which includes Chinese numeral letters)
///    are classified as 'N'
///  - combiners inherit the script of the character to the left
///  - combiners are classified as the char type (major designation) they are "typically"
///    applied to (not depending on actual char).
///    This is needed so that combiners that end up as single SentencePieces are classifyable.
///    Any error this causes must be learned by the model.
///  - designation changes only are a boundary if a letter or a number is on either side,
///    but e.g. not a punctuation symbol next to a space or math symbol
///  - (special rule: Hiragana is not split from Kanji. Currently this rule is disabled.)
/// A space between two words gives rise to two boundaries (one on each side).
/// </summary>
/// <param name="line">The text to scan. Processed one UTF-16 char at a time.</param>
/// <returns>
/// The cut list: character offsets in ascending order, always starting with 0 and ending
/// with the length of <paramref name="line"/>. An empty string is not cut; it yields { 0, 0 }.
/// </returns>
public static IList<int> DetectUnambiguousWordBreaks(string line) // @TODO: Better name for this?
{
    if (line.Length == 0) // graceful exit in case of empty input
    {
        return new List<int> { 0, 0 }; // empty input is not cut
    }

    // First, determines the major Unicode designation and script for each character, but with modifications,
    // for purpose of simple word breaking:
    //  - allowed punctuation marks inside words are flipped to 'L'
    //  - allowed punctuation marks inside numbers are flipped to 'N'
    //  - unambiguous CJK number letters are flipped to 'N'
    //  - combining marks carry over the script from their main character
    var scripts = new Unicode.Script[line.Length];
    var designations = new char[line.Length];
    for (var i = 0; i < line.Length; i++)
    {
        var c = line[i];
        // @TODO: handle surrogate pairs
        var m = Unicode.GetUnicodeMajorDesignation(c);
        var s = Unicode.GetScript(c);

        // special case: consider unambiguous CJK number symbols as numerals
        if (c.IsNumeral())
        {
            // Must update m rather than designations[i]: designations[i] is assigned
            // from m below, which would otherwise overwrite the 'N' again.
            m = 'N';
        }
        // special case: combining marks carry over main character's script, and are classified as their most likely use (for consistency)
        else if (m == 'M')
        {
            m = c.GetCombinerTypicalMajorDesignation();
            if (i > 0)
            {
                s = scripts[i - 1];
            }
        }
        designations[i] = m;
        scripts[i] = s;

        // special case: allowed punctuation inside a word --@TODO: Likely script dependent, maybe language dependent
        if (i - 2 >= 0 && designations[i] == 'L' && designations[i - 2] == 'L' && IsValidPuncInsideWord(line[i - 1]))
        {
            designations[i - 1] = 'L';
        }
        // special case: allowed punctuation inside a number --@TODO: Likely script dependent, maybe language-locale dependent
        else if (i - 2 >= 0 && designations[i] == 'N' && designations[i - 2] == 'N' && IsValidPuncInsideNumber(line[i - 1]))
        {
            designations[i - 1] = 'N';
        }
        // @TODO: double-check handling of space characters: non-breaking space; optional hyphen
    }

    // This function operates on a string, so we can handle the case of Unicode.Script 1 - Common - Unicode.Script 2
    // This presently breaks this as (Unicode.Script 1 - Common, Unicode.Script 2).
    // Without further knowledge, we can only make an arbitrary hard choice here.
    // This is used by FactoredSegmenter, where that is OK because characters in Common are
    // typically broken off anyways.
    var cutList = new List<int>(200) { 0 }; // (0=line start, which the resulting cut list must include)
    var lastNonCommonScript = scripts[0];
    //if (lastNonCommonScript == Unicode.Script.Hiragana)
    //    lastNonCommonScript = Unicode.Script.Han; // no boundary between Kanji and Hiragana
    for (var pos = 1; pos < line.Length; pos++)
    {
        // detect change in character designation
        //  - break at number boundaries
        //     - add number factor
        //     - can numbers be part of words that need to be kept together for determining word-level factors?
        //  - break at word boundaries
        //     - letter/non-letter transitions
        //     - don't break apostrophes and hyphens with letters on both sides
        //  - break at script boundaries
        bool atDesignationChange = (designations[pos - 1] != designations[pos] &&
                                    (designations[pos - 1] == 'N' || designations[pos] == 'N' ||
                                     designations[pos - 1] == 'L' || designations[pos] == 'L'));

        // detect script change
        var thisScript = scripts[pos];
        //if (thisScript == Unicode.Script.Hiragana) // the jury is still out whether we should do this or not
        //    thisScript = Unicode.Script.Han;
        bool atScriptChange = lastNonCommonScript != thisScript && thisScript != Unicode.Script.Common;
        // Note: If there is a script change across Common, we choose one arbitrarily.

        if (thisScript != Unicode.Script.Common || atDesignationChange) // condition 'atDesignationChange' is for back compat only; maybe not needed
        {
            lastNonCommonScript = thisScript;
        }

        // add cut point if one was found
        if (atDesignationChange || atScriptChange)
        {
            cutList.Add(pos);
        }
    }
    cutList.Add(line.Length); // the cut list always ends with the end-of-line offset
    return cutList;
}