/// <summary>
/// Parses a character set out of <paramref name="input"/>, beginning at the
/// position given by <paramref name="p"/>. When parsing succeeds,
/// <paramref name="p"/> is moved to the first character after the closing
/// bracket of the set.
/// </summary>
/// <param name="input">The input string for the parser.</param>
/// <param name="p">The current position in the input string; overwritten with
/// the parser's final position once parsing has completed.</param>
/// <returns>The parsed character set. If no valid character set can be parsed,
/// an exception is thrown and <paramref name="p"/> is left untouched.</returns>
public static Core.CharacterSet Parse(string input, ref int p)
{
    // The parser works on a mutable position object rather than on the ref int.
    ParsePosition cursor = new ParsePosition(p);
    Core.CharacterSet parsed = new CharacterSetParser(input, cursor).Parse();

    // Hand the position reached by the parser back to the caller.
    p = cursor.Position;
    return parsed;
}
/// <summary>
/// Builds the default measure recognizer for <paramref name="culture"/>.
/// Number patterns are computed for the culture-default separators and, where
/// applicable, for swapped separators and en-US separators; each set of
/// patterns is augmented for measures and registered on the recognizer.
/// </summary>
/// <param name="culture">The culture the patterns are computed for.</param>
/// <param name="priority">NOTE(review): this parameter is not referenced in the
/// method body; the recognizer is constructed with the literal 100. Confirm
/// whether that is intended.</param>
/// <returns>The configured recognizer, or <c>null</c> if any exception is
/// thrown while building it.</returns>
public static Recognizer Create(System.Globalization.CultureInfo culture, int priority)
{
    try
    {
        // TODO support non-blank languages for unit separation
        MeasureRegexRecognizer recognizer = new MeasureRegexRecognizer(100, "DEFAULT_MEASURE_RECOGNIZER", culture);

        // Augmentation doesn't change FIRST(), so the set reported by the number
        // pattern computation is used as-is for the augmented patterns.
        Core.CharacterSet firstSet;

        // 1. Culture-default separators (registered with value 2).
        List<string> rxPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.CultureDefault, out firstSet);
        AugmentPatterns(rxPatterns, culture);
        foreach (string rx in rxPatterns)
        {
            // Same FIRST() for every pattern (the number regex pattern computer
            // returns just one pattern anyway).
            recognizer.Add(rx, firstSet, 2);
        }

        // 2. Swapped separators, only if the culture's combination allows it (value 1).
        bool separatorsSwappable = new SeparatorCombination(culture, false).IsSwappable();
        if (separatorsSwappable)
        {
            rxPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.Swapped, out firstSet);
            AugmentPatterns(rxPatterns, culture);
            foreach (string rx in rxPatterns)
            {
                recognizer.Add(rx, firstSet, 1);
            }
        }

        // 3. en-US separators, if they should be added for this culture (value 0).
        if (NumberPatternComputer.DoAddENUSSeparators(culture))
        {
            rxPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.EnUS, out firstSet);
            AugmentPatterns(rxPatterns, culture);
            foreach (string rx in rxPatterns)
            {
                recognizer.Add(rx, firstSet, 0);
            }
        }

        // Otherwise "123 ABC" will be recognized as "123 A" "BC" in Japanese.
        recognizer.OnlyIfFollowedByNonwordCharacter = true;

        return recognizer;
    }
    catch // (System.Exception e)
    {
        // Any failure while building the recognizer is reported as "no recognizer".
        return null;
    }
}
/// <summary>
/// Initializes a recognizer from the supplied settings. The additional
/// terminators start out as <c>null</c>, and both fallback flags start out
/// as <c>false</c>.
/// </summary>
/// <param name="t">The token type stored for this recognizer.</param>
/// <param name="priority">The priority stored for this recognizer.</param>
/// <param name="tokenClassName">The token class name stored for this recognizer.</param>
/// <param name="recognizerName">The name stored for this recognizer.</param>
/// <param name="autoSubstitutable">The auto-substitutable flag stored for this recognizer.</param>
public Recognizer(Core.Tokenization.TokenType t, int priority, string tokenClassName, string recognizerName, bool autoSubstitutable)
{
    // Values supplied by the caller.
    _Type = t;
    _Priority = priority;
    _TokenClassName = tokenClassName;
    _RecognizerName = recognizerName;
    _AutoSubstitutable = autoSubstitutable;

    // Fixed initial values.
    _AdditionalTerminators = null;
    _IsFallbackRecognizer = false;
    _OverrideFallbackRecognizer = false;
}
/// <summary>
/// Builds the default currency recognizer for <paramref name="culture"/>.
/// Number patterns are computed for the culture-default separators only,
/// augmented with the regular expression of the currency symbols, and
/// registered on the recognizer.
/// </summary>
/// <param name="culture">The culture whose number format and currency
/// positive pattern are used.</param>
/// <param name="currencySymbols">The word list providing the regular
/// expression (and its FIRST() character set) for the currency symbols.</param>
/// <param name="priority">NOTE(review): this parameter is not referenced in the
/// method body; the recognizer is constructed with the literal 100. Confirm
/// whether that is intended.</param>
/// <returns>The configured recognizer, or <c>null</c> if any exception is
/// thrown while building it (including a null argument being dereferenced).</returns>
public static Recognizer Create(System.Globalization.CultureInfo culture, Core.Wordlist currencySymbols, int priority)
{
    try
    {
        // TODO support non-blank languages for unit separation

        Core.CharacterSet curFirst;
        string currenciesRx = currencySymbols.GetRegularExpression(out curFirst);

        // NumberFormatInfo.CurrencyPositivePattern: 0 = "$n", 1 = "n$", 2 = "$ n", 3 = "n $".
        // Even values therefore place the currency symbol before the number.
        int currencyPattern = culture.NumberFormat.CurrencyPositivePattern;
        bool currencyPrecedesNumber = (currencyPattern % 2) == 0;

        CurrencyRegexRecognizer result = new CurrencyRegexRecognizer(100, "DEFAULT_CURRENCY_RECOGNIZER", culture);

        // Augmentation doesn't change FIRST()
        // TODO use currency pattern instead of number pattern?
        Core.CharacterSet first;
        List<string> patterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.CultureDefault, out first);

        if (currencyPrecedesNumber)
        {
            // A match may start with a currency symbol, so its FIRST() set is merged in.
            first.Add(curFirst);
        }

        AugmentPatterns(patterns, currenciesRx, culture);

        foreach (string p in patterns)
        {
            // Use the same first for all patterns (the number regex pattern
            // computer returns just one pattern anyway).
            result.Add(p, first, 2);
        }

        // Be strict for currencies (only flexible for measurements and numbers):
        // the swapped-separator and en-US-separator pattern variants are
        // deliberately NOT registered here.

        result.OnlyIfFollowedByNonwordCharacter = Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture);

        return result;
    }
    catch // (System.Exception e)
    {
        // Any failure while building the recognizer is reported as "no recognizer".
        return null;
    }
}
/// <summary>
/// Parses a bracketed character set starting at the current position. The
/// set must open with '[', may be negated with a leading '^', and may contain
/// plain characters, ranges of the form a-b, and character classes of the
/// form [:name:]. Parsing ends after the closing ']' has been consumed.
/// </summary>
/// <returns>The parsed character set.</returns>
/// <remarks>
/// Throws <c>Core.LanguagePlatformException</c> with
/// <c>Core.ErrorCode.TokenizerInvalidCharacterSet</c> when the input ends
/// before the set is closed or when a character class name is empty or unknown.
/// </remarks>
private Core.CharacterSet Parse()
{
    Core.CharacterSet result = new Core.CharacterSet();

    Expect('[');

    if (LookingAt() == '^')
    {
        result.Negated = true;
        _Position.Advance();
    }

    const int startState = 0;
    const int charclassState = 1;
    const int finalState = 99;

    int state = startState;

    while (state != finalState)
    {
        char lookAt = LookingAt();

        switch (state)
        {
            case startState:
                switch (lookAt)
                {
                    case '[':
                        // Possibly the start of a character class "[:name:]".
                        _Position.Advance();
                        if (LookingAt() == ':')
                        {
                            _Position.Advance();
                            state = charclassState;
                        }
                        else if (LookingAt() != '\0')
                        {
                            // A literal '['; the following character is handled
                            // by the next iteration, so don't advance.
                            result.Add('[');
                        }
                        // On '\0' nothing is added here; the next iteration
                        // reports the unexpected end of input.
                        break;

                    case '\0':
                        // Unexpected end of input.
                        throw new Core.LanguagePlatformException(Core.ErrorCode.TokenizerInvalidCharacterSet, _Input);

                    case ']':
                        // Right bracket - consume it and finish.
                        _Position.Advance();
                        state = finalState;
                        break;

                    default:
                        {
                            // A plain character (escaped, simple, or Unicode hex).
                            char lower = ScanChar();

                            if (LookingAt() == '-')
                            {
                                _Position.Advance();
                                if (LookingAt() == ']')
                                {
                                    // Dash directly before the closing bracket: add a
                                    // literal dash, skip ']', and go to the final state.
                                    result.Add(lower);
                                    result.Add('-');
                                    _Position.Advance();
                                    state = finalState;
                                }
                                else
                                {
                                    // A range lower-upper.
                                    char upper = ScanChar();
                                    result.Add(lower, upper);
                                }
                            }
                            else
                            {
                                result.Add(lower);
                            }
                        }
                        break;
                }
                break;

            case charclassState:
                // Just got '[' followed by ':'.
                if (LookingAt() == '\0')
                {
                    throw new Core.LanguagePlatformException(Core.ErrorCode.TokenizerInvalidCharacterSet, _Input);
                }
                else
                {
                    StringBuilder className = new StringBuilder();
                    while (char.IsLetter(LookingAt()))
                    {
                        className.Append(LookingAt());
                        _Position.Advance();
                    }

                    if (className.Length == 0)
                    {
                        throw new Core.LanguagePlatformException(Core.ErrorCode.TokenizerInvalidCharacterSet, _Input);
                    }

                    // Lower-case with the invariant culture: the culture-sensitive
                    // ToLower() maps 'I' to a dotless i under e.g. tr-TR, which would
                    // make the lookup depend on the current thread culture.
                    Nullable<System.Globalization.UnicodeCategory> category =
                        Core.CharacterProperties.GetUnicodeCategoryFromName(className.ToString().ToLowerInvariant());

                    if (!category.HasValue)
                    {
                        throw new Core.LanguagePlatformException(Core.ErrorCode.TokenizerInvalidCharacterSet, _Input);
                    }

                    result.Add(category.Value);

                    Expect(':');
                    Expect(']');

                    state = startState;
                }
                break;

            default:
                // Unexpected state.
                throw new Core.LanguagePlatformException(Core.ErrorCode.TokenizerInvalidCharacterSet, _Input);
        }
    }

    return result;
}