コード例 #1
0
        /// <summary>
        /// Builds the regular expressions for date/times, phone numbers, NHS numbers,
        /// e-mail addresses and post codes and adds each of them to <c>RegexCache</c>.
        /// </summary>
        /// <remarks>
        /// All expressions are created with <see cref="RegexOptions.Compiled"/>. The date/time
        /// expression is a single alternation of several day/month/year layouts assembled from
        /// the smaller fragments declared at the top of the method.
        /// </remarks>
        private void LoadRegexCache()
        {
            // ---- date/time fragments -------------------------------------------------------
            const string weekdaysRegex    = @"(Sun|Mon|Tue|Wed|Thu|Fri|Sat)";
            const string monthDayRegex    = @"((0?[1-9]|[1-2][0-9]|3[01])(st|nd|rd|th)?)";

            // Each month accepts its three-letter abbreviation or the full name. The optional
            // marker must wrap the whole suffix ("(ober)?"), otherwise "Oct" alone never matches.
            var          wordedMonthRegex =
                "((Jan(uary)?)|(Feb(ruary)?)|(Mar(ch)?)|(Apr(il)?)|May|(Jun(e)?)|(Jul(y)?)|(Aug(ust)?)|(Sep(tember)?)|(Oct(ober)?)|(Nov(ember)?)|(Dec(ember)?))";

            // Accept the title-case spelling above or an all-caps spelling (e.g. "JAN", "JANUARY").
            // The pattern contains no escape sequences, so upper-casing it is safe.
            wordedMonthRegex = $"({wordedMonthRegex}|{wordedMonthRegex.ToUpperInvariant()})";
            const string numberedMonthRegex = @"((0?[1-9]|1[0-2]))";
            const string yearRegex          = "(19[0-9]{2}|[2-9][0-9]{3}|[0-9]{2})";
            // Seconds allow the value 60.
            const string timeRegex          = @"(\s+(2[0-3]|[0-1]?[0-9]):([0-5][0-9])(:(60|[0-5][0-9]))?)";
            // Either a signed four-digit offset or a zone abbreviation / single zone letter.
            const string timezoneRegex      = @"(([-\\+][0-9]{2}[0-5][0-9]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z])))";

            // Supported layouts, in order:
            //   day <sep> numeric month <sep> year
            //   day <sep> worded month  <sep> year
            //   numeric month <sep> year
            //   worded month <sep> day ... year
            //   worded month year
            // each with an optional time and time zone where the pattern allows it.
            var dateTimeFormats = new[]
            {
                $@"\s*{weekdaysRegex}?{monthDayRegex}[.\\/-]\s*{numberedMonthRegex}[.\\/-]\s*{yearRegex}{timeRegex}?{timezoneRegex}?",
                $@"\s*{weekdaysRegex}?{monthDayRegex}[ .\\/-]{wordedMonthRegex}[ .\\/-]{yearRegex}{timeRegex}?{timezoneRegex}?",
                $@"\s*{weekdaysRegex}?{numberedMonthRegex}[.\\/-]{yearRegex}[ .\\/-]{timeRegex}?{timezoneRegex}?",
                $@"\s*{weekdaysRegex}?{wordedMonthRegex}[ .\\/-]{monthDayRegex}[ .\\/-]{timeRegex}?{timezoneRegex}?\s+{yearRegex}",
                $@"\s*{wordedMonthRegex}\s*{yearRegex}([ .\\/-]{timeRegex}{timezoneRegex}?)?"
            };

            // ---- street-type fragments (used by the post code expression) ------------------
            var streetAbbreviations = new[]
            {
                "Ave",
                "Blvd",
                "Bdwy",
                "Cir",
                "Cl",
                "Ct",
                "Cr",
                "Dr",
                "Gdn",
                "Gdns",
                "Gn",
                "Gr",
                "Ln",
                "Mt",
                "Pl",
                "Pk",
                "Rdg",
                "Rd",
                "Sq",
                "St",
                "Ter",
                "Val"
            };

            // Turn each abbreviation into "(\s[Aa][Vv][Ee]\.)": leading whitespace, the letters in
            // either case, then the abbreviation's trailing dot.
            var streetTypeRegexes = streetAbbreviations.Select(abbreviation =>
            {
                var regex = new StringBuilder("(\\s");
                foreach (var c in abbreviation)
                {
                    regex.Append($"[{c.ToString().ToUpperInvariant()}{c.ToString().ToLowerInvariant()}]");
                }

                // The dot is escaped on purpose. An unescaped '.' matches any character, which
                // makes this branch overlap the generic character class it is alternated with
                // inside a '*' loop in the post code expression and leads to exponential
                // backtracking on ordinary text.
                regex.Append("\\.)");
                return regex.ToString();
            }).ToArray();

            // date/times
            RegexCache.Add(new Regex(string.Join("|", dateTimeFormats), RegexOptions.Compiled));
            // phone numbers: a leading 0, or an international prefix ("+" or "00" followed by
            // one to three digits), then ten digits with optional whitespace at fixed positions.
            RegexCache.Add(new Regex(@"(([0]|((\+|00)[0-9]{1,3}))[0-9][0-9][0-9]\s*[0-9]\s*[0-9][0-9]\s*[0-9]\s*[0-9][0-9][0-9])",
                                     RegexOptions.Compiled));
            // nhs numbers: ten digits grouped 3-3-4 (separators optional) or 3-4-3 (separators required)
            RegexCache.Add(new Regex(@"([0-9][0-9][0-9][ -]?[0-9][0-9][0-9][ -]?[0-9][0-9][0-9][0-9])", RegexOptions.Compiled));
            RegexCache.Add(new Regex(@"([0-9][0-9][0-9][ -][0-9][0-9][0-9][0-9][ -][0-9][0-9][0-9])", RegexOptions.Compiled));
            // email addresses: the pattern is written with lower-case classes only, so it has to
            // be matched case-insensitively or addresses containing capitals are missed.
            RegexCache.Add(new Regex(
                               @"((?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|""(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*"")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\]))",
                               RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant));
            // post codes, optionally preceded by free address text (letters, digits, whitespace,
            // commas and dotted street abbreviations) that ends in ", "
            RegexCache.Add(new Regex(
                               $@"(([,\sa-zA-Z0-9]|{string.Join("|", streetTypeRegexes)})*[,]\s)?(([gG][iI][rR] {{0,}}0[aA]{{2}})|((([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y]?[0-9][0-9]?)|(([a-pr-uwyzA-PR-UWYZ][0-9][a-hjkstuwA-HJKSTUW])|([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9O][abehmnprv-yABEHMNPRV-Y])))\s*[0-9O][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{{2}}))",
                               RegexOptions.Compiled));
        }