/// <summary>
/// Parses a runtime path using a character set seeded from <c>identifierCharSet</c>
/// and extended with '-' and '.'.
/// </summary>
/// <returns>Whatever <c>ParseCharactersFromCharSet</c> yields for that character set.</returns>
string RuntimePath()
{
    // The character set is created on first use and cached in the field afterwards.
    if (_runtimePathCharacterSet == null)
    {
        var pathChars = new CharacterSet(identifierCharSet);
        _runtimePathCharacterSet = pathChars;

        pathChars.Add('-'); // required for paths such as c-0, g-0 etc
        pathChars.Add('.');
    }

    return ParseCharactersFromCharSet(_runtimePathCharacterSet);
}
// Note: an identifier may begin with a digit,
// but a name consisting *solely* of digits is not an identifier.
protected string Identifier()
{
    // Create the set of permitted identifier characters on first use:
    // A-Z, a-z, 0-9 and the underscore.
    if (_identifierCharSet == null)
    {
        var allowed = new CharacterSet();
        _identifierCharSet = allowed;

        allowed.AddRange('A', 'Z');
        allowed.AddRange('a', 'z');
        allowed.AddRange('0', '9');
        allowed.Add('_');
    }

    // Consume as many identifier characters as are available (possibly none).
    var name = ParseCharactersFromCharSet(_identifierCharSet);
    if (name == null)
    {
        return null;
    }

    // The first non-digit character proves that the name is more than a number.
    foreach (var c in name)
    {
        if (c < '0' || c > '9')
        {
            return name;
        }
    }

    // Only digits were found (or no characters at all): reject.
    return null;
}
/// <summary>
/// Creates a regex-based recognizer for heading numbers, i.e. a run of digits followed by
/// one or more ".ddd" groups of one to three digits each (for example "1.2.3").
/// </summary>
/// <param name="culture">Not consulted by the current implementation (see TODOs below).</param>
/// <param name="priority">Priority handed to the recognizer's constructor.</param>
/// <returns>The configured recognizer, or null if anything throws while it is being built.</returns>
private Recognizer CreateHeadingNumberRecognizer(System.Globalization.CultureInfo culture, int priority)
{
    try
    {
        // TODO use culture's digits but don't use Regex \d placeholder as it's not culture sensitive
        // TODO set context (word boundaries)
        // TODO make IP address an alphanumeric token with placeable features?
        // TODO treat all alphanumeric tokens as placeables?
        string headingPattern = "[0-9]+(\\.([0-9]{1,3}))+";

        // A match always begins with an ASCII digit.
        CharacterSet leadingDigits = new CharacterSet();
        leadingDigits.Add('0', '9');

        RegexRecognizer headingRecognizer = new RegexRecognizer(
            TokenType.OtherTextPlaceable,
            priority,
            "GENHNUMPLC",
            "GENERIC_HEADINGNUMBER_RECOGNIZER",
            true);
        headingRecognizer.Add(headingPattern, leadingDigits);

        // TODO is this culture-dependent?
        headingRecognizer.OnlyIfFollowedByNonwordCharacter = true;

        return headingRecognizer;
    }
    catch
    {
        // Best effort: any failure simply means that no recognizer is available.
        return null;
    }
}
/// <summary>
/// Creates a sort helper bound to the given text box. The text box's current text is
/// cached, a fresh score cache is created, and the static character sets are built
/// the first time any instance is constructed.
/// </summary>
/// <param name="_textBox">The text box this helper works on.</param>
public SortHelper(TextBox _textBox)
{
    textBox = _textBox;
    textBoxCache = textBox.Text;
    scoreCache = new Dictionary<string, float>();

    // Static member, initialized only once: space separators plus '.' and '\'.
    if (WordSeperators == null)
    {
        var separators = new CharacterSet();
        WordSeperators = separators;

        separators.Add(UnicodeCategory.SpaceSeparator);
        separators.Add('.');
        separators.Add('\\');
    }

    // Static member, initialized only once: every uppercase letter.
    if (Uppercase == null)
    {
        var uppercaseLetters = new CharacterSet();
        Uppercase = uppercaseLetters;

        uppercaseLetters.Add(UnicodeCategory.UppercaseLetter);
    }
}
/// <summary>
/// Creates the regex-based recognizer for URIs with the schemes
/// mailto:, http://, https://, ftp:// and file://.
/// </summary>
/// <param name="actualCulture">Culture used to decide whether a match must be followed by a non-word character.</param>
/// <param name="priority">Priority handed to the recognizer's constructor.</param>
/// <returns>The configured URI recognizer.</returns>
private RegexRecognizer CreateUriRecognizer(System.Globalization.CultureInfo actualCulture, int priority)
{
    // TODO this shouldn't be a recognizer but rather a simple classifier after the
    //  default fallback recognizer (for performance reasons)
    // TODO set context restrictions of the recognizer
    RegexRecognizer uriRecognizer = new RegexRecognizer(TokenType.Uri, priority, "URI", "DEFAULT_URI_REGOCNIZER");

    // Possible first characters, in either case, of: http, https, mailto, ftp, file
    CharacterSet schemeInitials = new CharacterSet();
    foreach (char initial in "hHmMfF")
    {
        schemeInitials.Add(initial);
    }

    uriRecognizer.Add(
        "(mailto:|((https|http|ftp|file)://))[\\p{L}\\p{N}\\p{Pc}\\p{Pd}\\p{Po}\\p{S}-['\"<>]]*[\\p{L}\\p{N}\\p{Pc}\\p{Pd}\\p{S}/]",
        schemeInitials,
        true);

    // not sure about this one:
    uriRecognizer.OnlyIfFollowedByNonwordCharacter =
        Core.CultureInfoExtensions.UseBlankAsWordSeparator(actualCulture);

    return uriRecognizer;
}
/// <summary>
/// Builds the number regex patterns for one concrete combination of separators and digits.
/// </summary>
/// <param name="grpSep">Group separator; must be exactly one character long.</param>
/// <param name="decSep">Decimal separator; must be exactly one character long.</param>
/// <param name="digits">
/// Ten digit strings of exactly one character each; the first and the last entry are used
/// as the bounds of a character range.
/// </param>
/// <param name="first">
/// Receives the characters a match can start with (the digit range plus the sign characters),
/// or null if the arguments are rejected.
/// </param>
/// <returns>
/// Two patterns (ungrouped and grouped integer part, each with an optional sign and an optional
/// fractional part), or null if the arguments are rejected.
/// </returns>
private static List<string> ComputeRXPatterns(string grpSep, string decSep, IList<string> digits, out CharacterSet first)
{
    first = null;

    bool inputsUsable =
        decSep != null && decSep.Length == 1
        && grpSep != null && grpSep.Length == 1
        && digits != null && digits.Count == 10
        && digits.All(d => d.Length == 1);
    if (!inputsUsable)
    {
        return null;
    }

    // TODO escape digit's special symbols (unlikely)
    string digitClass = System.String.Format("[{0}-{1}]", digits[0], digits[9]);

    // haven't yet seen other signs in any of the supported cultures
    string optionalSign = "(?<sign>[-+\x2013\x2212\xFF0B\xFF0D])?";

    // FIRST: all digits plus the signs
    first = new CharacterSet();
    first.Add(digits[0][0], digits[9][0]);
    first.Add('-');
    first.Add('+');
    first.Add('\x2013');
    first.Add('\x2212');
    first.Add('\xFF0B');
    first.Add('\xFF0D');

    // ---------- digit sequence (preceding decimal point)
    string plainIntegerPart = System.String.Format("(?<sdec>{0}+)", digitClass);
    // TODO non-breaking space as grp sep
    string groupedIntegerPart = System.String.Format("(?<gdec>{0}{{1,3}}(\\{1}{2}{{3}})*)", digitClass, grpSep, digitClass);

    // ---------- fractional part
    // do NOT recognize .25 (the integer part is mandatory)
    string optionalFraction = System.String.Format("(?<frac>\\{0}{1}+)?", decSep, digitClass);

    return new List<string>
    {
        optionalSign + plainIntegerPart + optionalFraction,
        optionalSign + groupedIntegerPart + optionalFraction
    };
}
/// <summary>
/// Reads a character set from the current record: an index, a skipped 16-bit value,
/// the range count, a skipped entry, and then character ranges until the record is complete.
/// </summary>
/// <returns>The character set holding every range that was read.</returns>
internal CharacterSet ReadCharacterSet()
{
    ushort setIndex = ReadUInt16();

    // Probably the 'Unicode Plane'; the value is not used.
    ReadUInt16();

    ushort expectedRanges = ReadUInt16();

    // Reserved entry; the value is not used.
    ReadEntry();

    var result = new CharacterSet(setIndex, expectedRanges);
    while (!IsRecordComplete())
    {
        var range = ReadCharacterRange();
        result.Add(range);
    }

    return result;
}
/// <summary>
/// Creates the regex-based recognizer for acronyms: an uppercase letter, up to four
/// uppercase letters or ampersands, and a final uppercase letter.
/// </summary>
/// <param name="actualCulture">Culture used to decide whether a match must be followed by a non-word character.</param>
/// <param name="priority">Priority handed to the recognizer's constructor.</param>
/// <returns>The configured acronym recognizer.</returns>
private RegexRecognizer CreateAcronymRecognizer(System.Globalization.CultureInfo actualCulture, int priority)
{
    // TODO this shouldn't be a recognizer but rather a simple classifier after the
    //  default fallback recognizer (for performance reasons)
    // TODO set context restrictions of the recognizer
    // TODO acronyms shouldn't be recognized in an all-caps context.
    // TODO we may also include some additional special symbols such as "_" "."
    RegexRecognizer acronymRecognizer = new RegexRecognizer(TokenType.Acronym, priority, "ACR", "DEFAULT_ACR_REGOCNIZER");

    // Every match starts with an uppercase letter.
    CharacterSet uppercaseStart = new CharacterSet();
    uppercaseStart.Add(System.Globalization.UnicodeCategory.UppercaseLetter);

    // TODO doesn't catch e.g. OePNV, BfA, APIs
    // (an earlier ASCII-only pattern was "[A-Z][A-Z&]{0,2}[A-Z]{1,3}")
    acronymRecognizer.Add(@"\p{Lu}[\p{Lu}&]{0,4}\p{Lu}", uppercaseStart);

    acronymRecognizer.OnlyIfFollowedByNonwordCharacter =
        Core.CultureInfoExtensions.UseBlankAsWordSeparator(actualCulture);

    return acronymRecognizer;
}
/// <summary>
/// Initializes the tokenizer parameters from the tokenizer setup information and registers
/// the recognizers selected by the setup's builtin-recognizer flags, followed by the
/// default fallback recognizer. The recognizer list is sorted at the end.
/// </summary>
/// <param name="setup">The tokenizer setup to use; must not be null.</param>
/// <param name="accessor">
/// A resource data accessor that is passed on to the recognizer factories. If null,
/// a new <c>ResourceFileResourceAccessor</c> is used instead.
/// </param>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="setup"/> is null.</exception>
public TokenizerParameters(TokenizerSetup setup, Core.Resources.IResourceDataAccessor accessor)
{
    if (setup == null)
    {
        throw new ArgumentNullException("setup");
    }

    // From here on the accessor is never null.
    accessor = accessor ?? new ResourceFileResourceAccessor();

    _BreakOnWhitespace = setup.BreakOnWhitespace;
    _CreateWhitespaceTokens = setup.CreateWhitespaceTokens;
    _Culture = Core.CultureInfoExtensions.GetCultureInfo(setup.CultureName);
    _Recognizers = new List<Recognizer>();
    _ReclassifyAcronyms = false;

    // we need to determine a region-qualified culture since neutral cultures
    // don't have date/time/number pattern info
    System.Globalization.CultureInfo actualCulture = _Culture.IsNeutralCulture
        ? Core.CultureInfoExtensions.GetRegionQualifiedCulture(_Culture)
        : _Culture;

    var enabled = setup.BuiltinRecognizers;

    // ----- dates
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeDates) != 0)
    {
        AddRecognizer(DateTimeRecognizer.Create(
            accessor,
            actualCulture,
            DateTimePatternType.ShortDate | DateTimePatternType.LongDate,
            100));
    }

    // ----- times
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeTimes) != 0)
    {
        AddRecognizer(DateTimeRecognizer.Create(
            accessor,
            actualCulture,
            DateTimePatternType.ShortTime | DateTimePatternType.LongTime,
            100));
    }

    // ----- numbers
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeNumbers) != 0)
    {
        if (UseRXNumberRecognizer)
        {
            AddRecognizer(NumberRegexRecognizer.Create(actualCulture, 100));
        }
        else
        {
            AddRecognizer(NumberFSTRecognizer.Create(accessor, actualCulture, 100));
        }

        // it does not make sense to recognize ordinal numbers if they don't become placeables and
        // don't participate in auto-localization. They'd also need to be auto-localized (3. -> 3rd)
        bool createOrdinalNumberRecognizer = false;
        if (createOrdinalNumberRecognizer)
        {
            // add special recognizer for ordinal numbers if ordinal followers are available
            // [0-9]+\. (?=[OrdinalFollowers])
            bool followersAvailable =
                accessor.GetResourceStatus(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true)
                != Core.Resources.ResourceStatus.NotAvailable;
            if (followersAvailable)
            {
                Wordlist ordinalFollowers = new Wordlist();
                CharacterSet ignoredFirst;
                ordinalFollowers.Load(accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true));

                if (ordinalFollowers.Count > 0)
                {
                    string ordinalNumbersRX = "[0-9]+\\.(?=[ \u00A0]"
                        + ordinalFollowers.GetRegularExpression(out ignoredFirst)
                        + "\\s)";

                    RegexRecognizer ordinalNumbersRecognizer =
                        new RegexRecognizer(TokenType.Word, 100, "ORDINALNUMBER", "Ordinal Number Recognizer");

                    CharacterSet ordinalFirst = new CharacterSet();
                    ordinalFirst.Add('0', '9');

                    ordinalNumbersRecognizer.Add(ordinalNumbersRX, ordinalFirst);
                    AddRecognizer(ordinalNumbersRecognizer);
                }
            }
        }
    }
    else
    {
        // TODO should we still add a rudimentary recognizer for alpha-numerals?
    }

    // TODO other recognizer types (for builtin token classes)

    // ----- acronyms (plus URIs and IP addresses)
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms) != 0)
    {
        RegexRecognizer acronymRecognizer = CreateAcronymRecognizer(actualCulture, 100);
        if (acronymRecognizer != null)
        {
            _ReclassifyAcronyms = true;
            AddRecognizer(acronymRecognizer);
        }

        // this shouldn't be in the "acronym" setting but it's too late for a UI change...
        RegexRecognizer uriRecognizer = CreateUriRecognizer(actualCulture, 100);
        AddRecognizer(uriRecognizer);

        // TODO make IP address recognizer optional?
        AddRecognizer(CreateIPAddressRecognizer(actualCulture, 101));
        // AddRecognizer(CreateHeadingNumberRecognizer(actualCulture, 50));
    }

    // ----- variables
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeVariables) != 0)
    {
        // No null check on the accessor is needed: it was replaced by a default instance above.
        try
        {
            RegexRecognizer variableRecognizer = CreateVariableRecognizer(accessor, actualCulture);
            if (variableRecognizer != null)
            {
                AddRecognizer(variableRecognizer);
            }
        }
        catch
        {
            // nop - errors while creating the variable recognizer are ignored
        }
    }

    // ----- measurements
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements) != 0)
    {
        Recognizer measureRecognizer;
        if (UseRXMeasurementRecognizer)
        {
            measureRecognizer = MeasureRegexRecognizer.Create(actualCulture, 100);
        }
        else
        {
            measureRecognizer = MeasureFSTRecognizer.Create(accessor, actualCulture, 100);
        }

        AddRecognizer(measureRecognizer);

        // The currency recognizer (CreateCurrencyRecognizer, guarded by the availability of the
        // CurrencySymbols resource) is disabled for the time being due to performance issues.
    }

#if false
    // TODO NOTE this slows down the performance too much - need to find a better way
    if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true) != Core.Resources.ResourceStatus.NotAvailable)
    {
        // add an abbreviation recognizer
        Wordlist abbreviations = new Wordlist();
        CharacterSet first;

        abbreviations.Load(accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true));

        string abbreviationsRX = abbreviations.GetRegularExpression(out first) + @"(?=\W)";
        RegexRecognizer abbreviationsRecognizer = new RegexRecognizer(TokenType.Abbreviation, 101, "ABBREVIATION", "Abbreviation Recognizer");
        abbreviationsRecognizer.Add(abbreviationsRX, first);
        AddRecognizer(abbreviationsRecognizer);
    }
#endif

    // ----- default fallback recognizer (always added)
    bool splitClitics = setup.SeparateClitics && Core.CultureInfoExtensions.UsesClitics(_Culture);
    Recognizer fallbackRecognizer = CreateDefaultFallbackRecognizer(splitClitics, accessor);
    AddRecognizer(fallbackRecognizer);

    SortRecognizers();
}
/// <summary>
/// Returns the number of the DFA state that corresponds to the given set of NFA states.
/// If no such state exists yet, it is registered, its accept list and outgoing edges are
/// computed, and the states its edges lead to are built recursively.
/// </summary>
/// <param name="nfaList">Indices into <c>m_nfa</c> whose closures make up the DFA state.</param>
/// <returns>The number of the existing or newly added DFA state.</returns>
private int BuildDFAState(HashSet<int> nfaList)
{
    // The candidate state is the union of the closures of all requested NFA states.
    var dfaState = new NfaState();
    foreach (var nfaIndex in nfaList)
    {
        dfaState.NFAStates.UnionWith(m_nfa[nfaIndex].NFAClosure);
    }

    int dfaIndex = DFAStateNumber(dfaState);
    if (dfaIndex != -1)
    {
        // Already known: nothing to build.
        return dfaIndex;
    }

    // Register the state before recursing so that recursive calls can find it.
    dfaIndex = AddDFAState(dfaState);

    foreach (var nfaIndex in dfaState.NFAStates)
    {
        foreach (var accepted in m_nfa[nfaIndex].AcceptList)
        {
            dfaState.AcceptList.Add(accepted);
        }
    }

    // Collect every outgoing edge of the member NFA states, together with its target.
    var targets = new List<int>();
    var outgoing = new List<Edge>();
    foreach (var nfaIndex in dfaState.NFAStates)
    {
        foreach (var edge in m_nfa[nfaIndex].Edges)
        {
            outgoing.Add(edge);
            targets.Add(edge.Target);
        }
    }

    if (targets.Count < 1)
    {
        return dfaIndex;
    }

    // Per target: the characters on edges to that target, minus the characters
    // on edges to any other target.
    var exclusiveSets = new List<CharacterSet>(targets.Count);
    foreach (int target in targets)
    {
        var exclusive = new CharacterSet();
        foreach (var edge in outgoing)
        {
            if (edge.Target == target)
            {
                exclusive.Add(edge.Characters);
            }
        }

        foreach (var edge in outgoing)
        {
            if (edge.Target != target)
            {
                exclusive.Subtract(edge.Characters);
            }
        }

        exclusiveSets.Add(exclusive);
    }

    // Whatever remains after removing all exclusive sets from the full edge alphabet.
    var ambiguous = new CharacterSet();
    foreach (var edge in outgoing)
    {
        ambiguous.Add(edge.Characters);
    }

    foreach (var exclusive in exclusiveSets)
    {
        ambiguous.Subtract(exclusive);
    }

    // Exclusive characters: one edge to the DFA state built from the single target.
    for (int i = 0; i < targets.Count; ++i)
    {
        if (exclusiveSets[i].Count >= 1)
        {
            var singleTarget = new HashSet<int> { targets[i] };
            dfaState.AddEdge(BuildDFAState(singleTarget), exclusiveSets[i]);
        }
    }

    // Remaining characters: one edge per character, to the DFA state built from
    // every target reachable with that character.
    foreach (var character in ambiguous)
    {
        var reachable = new HashSet<int>();
        foreach (var edge in outgoing)
        {
            if (!ReferenceEquals(edge.Characters, null) && edge.Characters.Contains(character))
            {
                reachable.Add(edge.Target);
            }
        }

        if (reachable.Count > 0)
        {
            var singleCharacter = new CharacterSet();
            singleCharacter.Add(character);
            dfaState.AddEdge(BuildDFAState(reachable), singleCharacter);
        }
    }

    return dfaIndex;
}
/// <summary>
/// Reads one character set from the current record. The record is consumed as: index,
/// an ignored 16-bit value, range count, an ignored entry, then character ranges
/// for as long as the record is not complete.
/// </summary>
/// <returns>The character set populated with the ranges that were read.</returns>
internal CharacterSet ReadCharacterSet()
{
    ushort tableIndex = ReadUInt16();
    ReadUInt16();                       // 'Unicode Plane'? - skipped
    ushort declaredRangeCount = ReadUInt16();
    ReadEntry();                        // reserved - skipped

    CharacterSet characters = new CharacterSet(tableIndex, declaredRangeCount);
    while (!IsRecordComplete())
    {
        characters.Add(ReadCharacterRange());
    }

    return characters;
}
/// <summary>
/// Computes the number regex patterns for a culture, using the culture's native digits and
/// its decimal/group separators as selected by <paramref name="separatorMode"/>. Patterns for
/// full-width digits and for the default digits 0-9 are appended where applicable.
/// </summary>
/// <param name="culture">A non-neutral culture; must not be null.</param>
/// <param name="separatorMode">
/// Whether to use the culture's separators as they are, swap decimal and group separator,
/// or force "." as decimal and "," as group separator.
/// </param>
/// <param name="first">
/// Receives the set of characters a match can start with, or null if no patterns are returned.
/// </param>
/// <returns>
/// The list of patterns, or null if the culture is neutral or its digits/separators are
/// rejected by the pattern builder.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="culture"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown if <paramref name="separatorMode"/> is not one of the handled values.
/// </exception>
internal static List<string> ComputeRXPatterns(System.Globalization.CultureInfo culture, NumberSeparatorMode separatorMode, out CharacterSet first)
{
    first = null;

    if (culture == null)
    {
        throw new ArgumentNullException("culture");
    }

    if (culture.IsNeutralCulture)
    {
        System.Diagnostics.Debug.Assert(false, "Cannot compute number pattern for neutral cultures");
        return null;
    }

    IList<string> digits = culture.NumberFormat.NativeDigits;
    string decSep = culture.NumberFormat.NumberDecimalSeparator;
    string grpSep = culture.NumberFormat.NumberGroupSeparator;

    switch (separatorMode)
    {
        case NumberSeparatorMode.CultureDefault:
            // nop - use the culture's separators as they are
            break;

        case NumberSeparatorMode.Swapped:
            {
                string tmp = decSep;
                decSep = grpSep;
                grpSep = tmp;
            }
            break;

        case NumberSeparatorMode.EnUS:
            decSep = ".";
            grpSep = ",";
            break;

        default:
            // An unhandled enum value is a caller error. Report it with the specific
            // argument exception instead of the base Exception type; callers that
            // catch Exception still catch it.
            throw new ArgumentOutOfRangeException("separatorMode", separatorMode, "Unexpected");
    }

    List<string> result = ComputeRXPatterns(grpSep, decSep, digits, out first);
    if (result == null)
    {
        return null;
    }

    // add full-width variant digits for jp, zh, ko
    if (NumberPatternComputer.DoAddFullwidthVariantDigits(culture))
    {
        Core.CharacterSet alternateFirst;
        digits = "\uFF10|\uFF11|\uFF12|\uFF13|\uFF14|\uFF15|\uFF16|\uFF17|\uFF18|\uFF19".Split('|');

        List<string> alternate = ComputeRXPatterns(grpSep, decSep, digits, out alternateFirst);
        if (alternate != null)
        {
            result.AddRange(alternate);
            first.Add(alternateFirst);
        }
    }

    // add patterns with default digits
    if (!CultureInfoExtensions.UsesDefaultDigits(culture))
    {
        Core.CharacterSet alternateFirst;
        digits = "0|1|2|3|4|5|6|7|8|9".Split('|');

        List<string> alternate = ComputeRXPatterns(grpSep, decSep, digits, out alternateFirst);
        if (alternate != null)
        {
            result.AddRange(alternate);
            first.Add(alternateFirst);
        }
    }

    return result;
}
/// <summary>
/// Creates a CharacterSet holding the letters a-z and A-Z, the digits 0-9,
/// and a fixed selection of punctuation and symbol characters.
/// </summary>
/// <returns>A new CharacterSet instance populated with those characters.</returns>
public static CharacterSet CreateDefaultCharacterSet()
{
    var defaults = new CharacterSet();

    // Letters and digits are added as ranges.
    defaults.Add('a', 'z');
    defaults.Add('A', 'Z');
    defaults.Add('0', '9');

    // Punctuation and symbols are added from a single string.
    defaults.Add("_.:,'!?£$%^&*()[]{}|~#/");

    return defaults;
}