/// <summary>
/// Parses a run of characters drawn from the runtime-path character set:
/// everything in identifierCharSet plus '-' and '.'.
/// </summary>
/// <returns>The result of ParseCharactersFromCharSet for that character set.</returns>
string RuntimePath()
        {
            // Fast path: the character set has already been built by an earlier call.
            if (_runtimePathCharacterSet != null)
            {
                return ParseCharactersFromCharSet(_runtimePathCharacterSet);
            }

            // First call: start from the identifier characters and extend them.
            var pathChars = _runtimePathCharacterSet = new CharacterSet(identifierCharSet);
            pathChars.Add('-');  // for c-0, g-0 etc
            pathChars.Add('.');

            return ParseCharactersFromCharSet(pathChars);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Parses an identifier made of ASCII letters, ASCII digits and '_'.
        /// An identifier may begin with a digit, but a run consisting solely of
        /// digits is rejected.
        /// </summary>
        /// <returns>The identifier text, or null when nothing was parsed or the text is digits only.</returns>
        protected string Identifier()
        {
            // Build the set of permitted identifier characters on first use.
            if (_identifierCharSet == null) {
                var allowed = _identifierCharSet = new CharacterSet ();
                allowed.AddRange ('A', 'Z');
                allowed.AddRange ('a', 'z');
                allowed.AddRange ('0', '9');
                allowed.Add ('_');
            }

            var candidate = ParseCharactersFromCharSet (_identifierCharSet);
            if (candidate == null)
                return null;

            // Accept as soon as one non-digit character is seen.
            for (int i = 0; i < candidate.Length; i++) {
                char ch = candidate [i];
                if (ch < '0' || ch > '9')
                    return candidate;
            }

            // Every character was a digit, so this is a number rather than an identifier.
            return null;
        }
        /// <summary>
        /// Creates a regex recognizer for dotted heading numbers: one or more ASCII digits
        /// followed by one or more ".ddd" groups of one to three digits (for example "1.2").
        /// Matches are reported as <c>TokenType.OtherTextPlaceable</c>.
        /// </summary>
        /// <param name="culture">Currently unused; see the TODOs below.</param>
        /// <param name="priority">Priority passed to the recognizer.</param>
        /// <returns>The configured recognizer, or null if anything threw while building it.</returns>
        private Recognizer CreateHeadingNumberRecognizer(System.Globalization.CultureInfo culture, int priority)
        {
            // TODO use culture's digits but don't use Regex \d placeholder as it's not culture sensitive
            // TODO set context (word boundaries)
            // TODO make IP address an alphanumeric token with placeable features?
            // TODO treat all alphanumeric tokens as placeables?
            const string headingPattern = "[0-9]+(\\.([0-9]{1,3}))+";

            try
            {
                // A match can only start with an ASCII digit.
                var leadingChars = new CharacterSet();
                leadingChars.Add('0', '9');

                var recognizer = new RegexRecognizer(TokenType.OtherTextPlaceable, priority,
                                                     "GENHNUMPLC", "GENERIC_HEADINGNUMBER_RECOGNIZER", true);
                recognizer.Add(headingPattern, leadingChars);

                // TODO is this culture-dependent?
                recognizer.OnlyIfFollowedByNonwordCharacter = true;

                return recognizer;
            }
            catch
            {
                // Best effort: any failure during construction yields "no recognizer".
                return null;
            }
        }
Exemplo n.º 4
0
            /// <summary>
            /// Creates a helper for the given text box, capturing its current text,
            /// and makes sure the shared WordSeperators and Uppercase character sets exist.
            /// </summary>
            /// <param name="_textBox">The text box whose Text is cached at construction time.</param>
            public SortHelper(TextBox _textBox)
            {
                textBox      = _textBox;
                textBoxCache = _textBox.Text;

                // Shared set of word separators: space separators, '.' and '\'.
                // Only built when no instance exists yet.
                if (WordSeperators == null)
                {
                    var separators = WordSeperators = new CharacterSet();
                    separators.Add(UnicodeCategory.SpaceSeparator);
                    separators.Add('.');
                    separators.Add('\\');
                }

                // Shared set of upper-case letters, likewise built on demand.
                if (Uppercase == null)
                {
                    var upper = Uppercase = new CharacterSet();
                    upper.Add(UnicodeCategory.UppercaseLetter);
                }

                scoreCache = new Dictionary <String, float>();
            }
        /// <summary>
        /// Creates the regex recognizer for URI tokens: text starting with "mailto:" or with
        /// "http://", "https://", "ftp://" or "file://".
        /// </summary>
        /// <param name="actualCulture">Culture used to decide whether a match must be followed by a non-word character.</param>
        /// <param name="priority">Priority passed to the recognizer.</param>
        /// <returns>The configured recognizer.</returns>
        private RegexRecognizer CreateUriRecognizer(System.Globalization.CultureInfo actualCulture, int priority)
        {
            // TODO this shouldn't be a recognizer but rather a simple classifier after the
            //  default fallback recognizer (for performance reasons)
            // TODO set context restrictions of the recognizer

            var uriRecognizer = new RegexRecognizer(TokenType.Uri, priority, "URI", "DEFAULT_URI_REGOCNIZER");

            // Possible first characters, in both cases: h (http, https), m (mailto), f (ftp, file).
            var leadingChars = new CharacterSet();
            foreach (char c in "hHmMfF")
            {
                leadingChars.Add(c);
            }

            uriRecognizer.Add("(mailto:|((https|http|ftp|file)://))[\\p{L}\\p{N}\\p{Pc}\\p{Pd}\\p{Po}\\p{S}-['\"<>]]*[\\p{L}\\p{N}\\p{Pc}\\p{Pd}\\p{S}/]", leadingChars, true);

            // not sure about this one:
            var blankSeparatesWords = Core.CultureInfoExtensions.UseBlankAsWordSeparator(actualCulture);
            uriRecognizer.OnlyIfFollowedByNonwordCharacter = blankSeparatesWords;

            return uriRecognizer;
        }
Exemplo n.º 6
0
        /// <summary>
        /// Builds the regular-expression patterns for numbers written with the given
        /// group separator, decimal separator and digit characters.
        /// </summary>
        /// <param name="grpSep">Group separator; must be exactly one character.</param>
        /// <param name="decSep">Decimal separator; must be exactly one character.</param>
        /// <param name="digits">
        /// Exactly ten one-character strings. <c>digits[0]</c> and <c>digits[9]</c> are used as the
        /// two ends of the digit character range.
        /// </param>
        /// <param name="first">
        /// Receives the set of characters a match may start with (the digit range plus the sign
        /// characters), or null when the arguments are rejected.
        /// </param>
        /// <returns>
        /// Two patterns (ungrouped integer part first, then grouped integer part), each with an
        /// optional sign and optional fractional part; or null when any argument is invalid.
        /// </returns>
        private static List <string> ComputeRXPatterns(string grpSep, string decSep, IList <string> digits,
                                                       out CharacterSet first)
        {
            first = null;

            if (decSep == null || decSep.Length != 1 ||
                grpSep == null || grpSep.Length != 1 ||
                digits == null || digits.Count != 10)
            {
                return(null);
            }

            // A null entry is treated like any other malformed digit instead of
            // causing a NullReferenceException.
            if (digits.Any(s => s == null || s.Length != 1))
            {
                return(null);
            }

            // TODO escape digit's special symbols (unlikely)
            string dig = System.String.Format("[{0}-{1}]", digits[0],
                                              digits[9]);

            // haven't yet seen other signs in any of the supported cultures
            // (fixed-width \u escapes: \x is variable-length and easy to misread)
            string optSign = "(?<sign>[-+\u2013\u2212\uFF0B\uFF0D])?";

            // FIRST: all digits plus the signs
            first = new CharacterSet();
            first.Add(digits[0][0], digits[9][0]);
            first.Add('-');
            first.Add('+');
            first.Add('\u2013');
            first.Add('\u2212');
            first.Add('\uFF0B');
            first.Add('\uFF0D');

            // ---------- digit sequence (preceding decimal point)

            string simpleDecPart = System.String.Format("(?<sdec>{0}+)", dig);

            // TODO non-breaking space as grp sep

            // one to three digits, then any number of "<grpSep> + three digits" groups
            string groupedDecPart = System.String.Format("(?<gdec>{0}{{1,3}}(\\{1}{2}{{3}})*)",
                                                         dig, grpSep, dig);

            // ---------- fractional part

            // do NOT recognize .25 (no optional decimal part)
            string optFracPart = System.String.Format("(?<frac>\\{0}{1}+)?", decSep, dig);

            List <string> result = new List <string>();

            result.Add(optSign + simpleDecPart + optFracPart);
            result.Add(optSign + groupedDecPart + optFracPart);

            return(result);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Reads a character-set record: an index, a skipped UInt16, a range count and a
        /// skipped entry, followed by character ranges until the record is complete.
        /// </summary>
        /// <returns>The character set populated with every range read from the record.</returns>
        internal CharacterSet ReadCharacterSet()
        {
            ushort setIndex = ReadUInt16();

            // Possibly the Unicode plane; the value is read and discarded.
            ReadUInt16();

            ushort declaredRangeCount = ReadUInt16();

            // Reserved entry; read and discarded.
            ReadEntry();

            var result = new CharacterSet(setIndex, declaredRangeCount);

            // Ranges are consumed until the reader reports the record as finished.
            while (true)
            {
                if (IsRecordComplete())
                {
                    break;
                }

                result.Add(ReadCharacterRange());
            }

            return result;
        }
        /// <summary>
        /// Creates the regex recognizer for acronym tokens: an upper-case letter, up to four
        /// further upper-case letters or '&amp;', and a closing upper-case letter.
        /// </summary>
        /// <param name="actualCulture">Culture used to decide whether a match must be followed by a non-word character.</param>
        /// <param name="priority">Priority passed to the recognizer.</param>
        /// <returns>The configured recognizer.</returns>
        private RegexRecognizer CreateAcronymRecognizer(System.Globalization.CultureInfo actualCulture, int priority)
        {
            // TODO this shouldn't be a recognizer but rather a simple classifier after the
            //  default fallback recognizer (for performance reasons)

            // TODO set context restrictions of the recognizer
            // TODO acronyms shouldn't be recognized in an all-caps context.
            // TODO we may also include some additional special symbols such as "_" "."
            var acronymRecognizer = new RegexRecognizer(TokenType.Acronym, priority, "ACR", "DEFAULT_ACR_REGOCNIZER");

            // A match can only start with an upper-case letter.
            var leadingChars = new CharacterSet();
            leadingChars.Add(System.Globalization.UnicodeCategory.UppercaseLetter);

            // TODO doesn't catch e.g. OePNV, BfA, APIs
            // (earlier ASCII-only variant: "[A-Z][A-Z&]{0,2}[A-Z]{1,3}")
            acronymRecognizer.Add(@"\p{Lu}[\p{Lu}&]{0,4}\p{Lu}", leadingChars);

            var blankSeparatesWords = Core.CultureInfoExtensions.UseBlankAsWordSeparator(actualCulture);
            acronymRecognizer.OnlyIfFollowedByNonwordCharacter = blankSeparatesWords;

            return acronymRecognizer;
        }
        /// <summary>
        /// Initialize the tokenizer parameters from the tokenizer setup information.
        /// Copies the whitespace settings and culture from <paramref name="setup"/>, then registers
        /// one recognizer per enabled <c>BuiltinRecognizers</c> flag, always followed by the default
        /// fallback recognizer, and finally sorts the recognizer list.
        /// The resource data accessor is handed to the date/time, FST number, FST measurement,
        /// variable and fallback recognizer factories.
        /// </summary>
        /// <param name="setup">The tokenizer setup to use; must not be null.</param>
        /// <param name="accessor">
        /// A resource data accessor. When null, a new <c>ResourceFileResourceAccessor</c> is used instead.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="setup"/> is null.</exception>
        public TokenizerParameters(TokenizerSetup setup, Core.Resources.IResourceDataAccessor accessor)
        {
            if (setup == null)
            {
                throw new ArgumentNullException("setup");
            }

            // Fall back to a default accessor so the code below can use it unconditionally.
            if (accessor == null)
            {
                accessor = new ResourceFileResourceAccessor();
            }

            _BreakOnWhitespace      = setup.BreakOnWhitespace;
            _CreateWhitespaceTokens = setup.CreateWhitespaceTokens;
            _Culture = Core.CultureInfoExtensions.GetCultureInfo(setup.CultureName);

            _Recognizers        = new List <Recognizer>();
            _ReclassifyAcronyms = false;

            // we need to determine a region-qualified culture since neutral cultures
            //  don't have date/time/number pattern info
            System.Globalization.CultureInfo actualCulture = _Culture;
            if (_Culture.IsNeutralCulture)
            {
                actualCulture = Core.CultureInfoExtensions.GetRegionQualifiedCulture(_Culture);
            }

            // Dates: short and long date patterns, priority 100.
            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeDates) != 0)
            {
                AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
                                                        DateTimePatternType.ShortDate | DateTimePatternType.LongDate, 100));
            }

            // Times: short and long time patterns, priority 100.
            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeTimes) != 0)
            {
                AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
                                                        DateTimePatternType.ShortTime | DateTimePatternType.LongTime, 100));
            }

            // Numbers: regex-based or FST-based implementation, selected by UseRXNumberRecognizer.
            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeNumbers) != 0)
            {
                if (UseRXNumberRecognizer)
                {
                    AddRecognizer(NumberRegexRecognizer.Create(actualCulture, 100));
                }
                else
                {
                    AddRecognizer(NumberFSTRecognizer.Create(accessor, actualCulture, 100));
                }

                // it does not make sense to recognize ordinal numbers if they don't become placeables and
                //  don't participate in auto-localization. They'd also need to be auto-localized (3. -> 3rd)
                // The flag is hard-wired to false, so the ordinal-number block below never executes.
                bool createOrdinalNumberRecognizer = false;
                if (createOrdinalNumberRecognizer)
                {
                    // add special recognizer for ordinal numbers if ordinal followers are available
                    // [0-9]+\. (?=[OrdinalFollowers])
                    if (accessor.GetResourceStatus(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true) != Core.Resources.ResourceStatus.NotAvailable)
                    {
                        Wordlist     ordinalFollowers = new Wordlist();
                        CharacterSet dummy;
                        ordinalFollowers.Load(accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true));
                        if (ordinalFollowers.Count > 0)
                        {
                            // digits + '.', only when followed by a space or NBSP, one of the follower words, and whitespace
                            string          ordinalNumbersRX         = "[0-9]+\\.(?=[ \u00A0]" + ordinalFollowers.GetRegularExpression(out dummy) + "\\s)";
                            RegexRecognizer ordinalNumbersRecognizer = new RegexRecognizer(TokenType.Word, 100, "ORDINALNUMBER", "Ordinal Number Recognizer");
                            CharacterSet    ordinalFirst             = new CharacterSet();
                            ordinalFirst.Add('0', '9');
                            ordinalNumbersRecognizer.Add(ordinalNumbersRX, ordinalFirst);
                            AddRecognizer(ordinalNumbersRecognizer);
                        }
                    }
                }
            }
            else
            {
                // TODO should we still add a rudimentary recognizer for alpha-numerals?
            }

            // TODO other recognizer types (for builtin token classes)
            // The acronym flag also switches on the URI and IP address recognizers.
            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms) != 0)
            {
                RegexRecognizer recog = CreateAcronymRecognizer(actualCulture, 100);
                if (recog != null)
                {
                    _ReclassifyAcronyms = true;
                    AddRecognizer(recog);
                }

                // this shouldn't be in the "acronym" setting but it's too late for a UI change...
                recog = CreateUriRecognizer(actualCulture, 100);
                AddRecognizer(recog);

                // TODO make IP address recognizer optional?
                // Registered with priority 101, unlike the 100 used for the other recognizers here.
                AddRecognizer(CreateIPAddressRecognizer(actualCulture, 101));
                // AddRecognizer(CreateHeadingNumberRecognizer(actualCulture, 50));
            }

            // Variables: best effort; any exception while creating the recognizer is ignored.
            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeVariables) != 0)
            {
                // accessor was replaced by a default instance above when it was null,
                // so this check is expected to always pass here
                if (accessor != null)
                {
                    try
                    {
                        RegexRecognizer recog = CreateVariableRecognizer(accessor, actualCulture);
                        if (recog != null)
                        {
                            AddRecognizer(recog);
                        }
                    }
                    catch                     // (System.Exception e)
                    {
                        // nop - ignore errors
                    }
                }
            }

            // Measurements: regex-based or FST-based implementation, selected by UseRXMeasurementRecognizer.
            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements) != 0)
            {
                Recognizer recog;

                if (UseRXMeasurementRecognizer)
                {
                    recog = MeasureRegexRecognizer.Create(actualCulture, 100);
                }
                else
                {
                    recog = MeasureFSTRecognizer.Create(accessor, actualCulture, 100);
                }

                AddRecognizer(recog);

                // disable for the time being due to performance issues
                //if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true) != Core.Resources.ResourceStatus.NotAvailable)
                //{
                //    recog = CreateCurrencyRecognizer(accessor, actualCulture);
                //    AddRecognizer(recog);
                //}
            }

#if false
            // TODO NOTE this slows down the performance too much - need to find a better way

            if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true) != Core.Resources.ResourceStatus.NotAvailable)
            {
                // add an abbreviation recognizer
                Wordlist     abbreviations = new Wordlist();
                CharacterSet first;
                abbreviations.Load(accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true));
                string          abbreviationsRX         = abbreviations.GetRegularExpression(out first) + @"(?=\W)";
                RegexRecognizer abbreviationsRecognizer = new RegexRecognizer(TokenType.Abbreviation, 101, "ABBREVIATION", "Abbreviation Recognizer");
                abbreviationsRecognizer.Add(abbreviationsRX, first);
                AddRecognizer(abbreviationsRecognizer);
            }
#endif

            // The default fallback recognizer is always registered, regardless of the setup flags.
            {
                Recognizer recog;

                // Clitics are only split when the setup asks for it and the culture uses them.
                bool split = setup.SeparateClitics && Core.CultureInfoExtensions.UsesClitics(_Culture);
                recog = CreateDefaultFallbackRecognizer(split, accessor);
                AddRecognizer(recog);
            }

            SortRecognizers();
        }
Exemplo n.º 10
0
        /// <summary>
        /// Builds (or finds) the DFA state corresponding to a set of NFA states and,
        /// for a newly added state, recursively builds the states reachable from it.
        /// </summary>
        /// <param name="nfaList">Indices into <c>m_nfa</c> of the NFA states to combine.</param>
        /// <returns>
        /// The value returned by <c>DFAStateNumber</c> when it is not -1, otherwise the value
        /// returned by <c>AddDFAState</c> for the new state.
        /// </returns>
        private int BuildDFAState(HashSet <int> nfaList)
        {
            NfaState state = new NfaState();

            // The combined state covers the closure of every requested NFA state.
            foreach (var num in nfaList)
            {
                state.NFAStates.UnionWith(m_nfa[num].NFAClosure);
            }

            // -1 is treated as "no matching state yet" (presumably a lookup of already-built states).
            int index = DFAStateNumber(state);

            if (index == -1)
            {
                // Register the state before recursing; presumably this lets recursive calls find it
                // through DFAStateNumber - TODO confirm.
                index = AddDFAState(state);

                // The new state accepts whatever any of its member NFA states accept.
                foreach (var num in state.NFAStates)
                {
                    foreach (var item in m_nfa[num].AcceptList)
                    {
                        state.AcceptList.Add(item);
                    }
                }

                // Gather every outgoing edge of the member NFA states and each edge's target.
                // NOTE(review): set2 is a List, so a target reached by several edges appears several
                // times; the loops below then process that target once per occurrence - confirm
                // whether AddEdge tolerates the repeated edge.
                var set2 = new List <int>();
                var list = new List <Edge>();
                foreach (var num in state.NFAStates)
                {
                    foreach (var edge in m_nfa[num].Edges)
                    {
                        list.Add(edge);
                        set2.Add(edge.Target);
                    }
                }

                if (set2.Count >= 1)
                {
                    // buildArray[i]: characters of edges to set2[i], minus the characters of
                    // edges that go to any other target.
                    var buildArray = new List <CharacterSet>(set2.Count);
                    foreach (int target in set2)
                    {
                        var build = new CharacterSet();
                        foreach (var edge in list)
                        {
                            if (edge.Target == target)
                            {
                                build.Add(edge.Characters);
                            }
                        }
                        foreach (var edge in list)
                        {
                            if (edge.Target != target)
                            {
                                build.Subtract(edge.Characters);
                            }
                        }
                        buildArray.Add(build);
                    }

                    // build5: all edge characters minus those already assigned to a single target
                    // above, i.e. the characters shared between edges to different targets.
                    CharacterSet build5 = new CharacterSet();
                    foreach (var edge in list)
                    {
                        build5.Add(edge.Characters);
                    }
                    foreach (var set in buildArray)
                    {
                        build5.Subtract(set);
                    }

                    // Single-target characters: one edge per target with a non-empty set.
                    for (int i = 0; i < set2.Count; ++i)
                    {
                        if (buildArray[i].Count >= 1)
                        {
                            var setList = new HashSet <int> {
                                set2[i]
                            };
                            state.AddEdge(BuildDFAState(setList), buildArray[i]);
                        }
                    }

                    // Shared characters: for each one, collect every target whose edge contains it
                    // and add an edge to the DFA state built from that group of targets.
                    foreach (var number in build5)
                    {
                        var setList = new HashSet <int>();
                        foreach (var edge in list)
                        {
                            // Edges without a character set are skipped here.
                            if (ReferenceEquals(edge.Characters, null))
                            {
                                continue;
                            }

                            if (edge.Characters.Contains(number))
                            {
                                setList.Add(edge.Target);
                            }
                        }
                        if (setList.Count > 0)
                        {
                            var build = new CharacterSet();
                            build.Add(number);
                            state.AddEdge(BuildDFAState(setList), build);
                        }
                    }
                }
            }
            return(index);
        }
        /// <summary>
        /// Reads a character-set record: the set index, a skipped UInt16, the range count,
        /// a skipped entry, and then character ranges until the record is complete.
        /// </summary>
        /// <returns>The character set holding all ranges read from the record.</returns>
        internal CharacterSet ReadCharacterSet()
        {
            // Header fields, in the order they are read.
            var tableIndex = ReadUInt16();
            ReadUInt16();                       // possibly the Unicode plane; discarded
            var rangeTotal = ReadUInt16();
            ReadEntry();                        // reserved; discarded

            var characters = new CharacterSet(tableIndex, rangeTotal);

            // The rest of the record is a sequence of character ranges.
            for (; !IsRecordComplete();)
            {
                var range = ReadCharacterRange();
                characters.Add(range);
            }

            return characters;
        }
Exemplo n.º 12
0
        /// <summary>
        /// Builds the number regular-expression patterns for a culture, using its native digits
        /// and its decimal/group separators as selected by <paramref name="separatorMode"/>.
        /// Patterns for full-width digits and for the default digits 0-9 are appended when
        /// the corresponding culture checks ask for them.
        /// </summary>
        /// <param name="culture">A non-neutral culture.</param>
        /// <param name="separatorMode">
        /// Use the culture's separators as they are, swap decimal and group separator,
        /// or force "." as decimal and "," as group separator.
        /// </param>
        /// <param name="first">
        /// Receives the set of characters a match may start with, or null when no patterns are returned.
        /// </param>
        /// <returns>
        /// The list of patterns, or null when the culture is neutral or the base patterns
        /// could not be computed.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="separatorMode"/> is not a known mode.</exception>
        internal static List <string> ComputeRXPatterns(System.Globalization.CultureInfo culture,
                                                        NumberSeparatorMode separatorMode,
                                                        out CharacterSet first)
        {
            first = null;

            if (culture == null)
            {
                throw new ArgumentNullException("culture");
            }

            if (culture.IsNeutralCulture)
            {
                System.Diagnostics.Debug.Assert(false, "Cannot compute number pattern for neutral cultures");
                return(null);
            }

            IList <string> digits = culture.NumberFormat.NativeDigits;

            string decSep = culture.NumberFormat.NumberDecimalSeparator;
            string grpSep = culture.NumberFormat.NumberGroupSeparator;

            switch (separatorMode)
            {
            case NumberSeparatorMode.CultureDefault:
                // nop
                break;

            case NumberSeparatorMode.Swapped:
            {
                string tmp = decSep;
                decSep = grpSep;
                grpSep = tmp;
            }
            break;

            case NumberSeparatorMode.EnUS:
                decSep = ".";
                grpSep = ",";
                break;

            default:
                // A specific argument exception instead of a bare System.Exception;
                // it still derives from Exception, so existing catch blocks keep working.
                throw new ArgumentOutOfRangeException("separatorMode", separatorMode,
                                                      "Unexpected number separator mode");
            }

            List <string> result = ComputeRXPatterns(grpSep, decSep, digits, out first);

            if (result == null)
            {
                return(null);
            }

            // add full-width variant digits for jp, zh, ko
            if (NumberPatternComputer.DoAddFullwidthVariantDigits(culture))
            {
                Core.CharacterSet alternateFirst;
                digits = "\uFF10|\uFF11|\uFF12|\uFF13|\uFF14|\uFF15|\uFF16|\uFF17|\uFF18|\uFF19".Split('|');
                List <string> alternate = ComputeRXPatterns(grpSep, decSep, digits, out alternateFirst);
                if (alternate != null)
                {
                    result.AddRange(alternate);
                    first.Add(alternateFirst);
                }
            }

            // add patterns with default digits
            if (!CultureInfoExtensions.UsesDefaultDigits(culture))
            {
                Core.CharacterSet alternateFirst;
                digits = "0|1|2|3|4|5|6|7|8|9".Split('|');
                List <string> alternate = ComputeRXPatterns(grpSep, decSep, digits, out alternateFirst);
                if (alternate != null)
                {
                    result.AddRange(alternate);
                    first.Add(alternateFirst);
                }
            }

            return(result);
        }
Exemplo n.º 13
0
 /// <summary>
 /// Builds a new CharacterSet holding the lower- and upper-case English letters,
 /// the digits 0-9 and a fixed selection of punctuation and symbol characters.
 /// </summary>
 /// <returns>The newly created character set.</returns>
 public static CharacterSet CreateDefaultCharacterSet( )
 {
     var defaults = new CharacterSet( );

     // Letters and digits are added as ranges.
     defaults.Add( 'a', 'z' );
     defaults.Add( 'A', 'Z' );
     defaults.Add( '0', '9' );

     // Punctuation and symbols are passed as one string.
     defaults.Add( "_.:,'!?£$%^&*()[]{}|~#/" );

     return defaults;
 }