/// <summary>
/// Creates a date/time pattern. The FST and the culture are forwarded to the base
/// class constructor; the pattern type and the format string are stored on this instance.
/// </summary>
/// <param name="patternType">The pattern type kept in <c>_PatternType</c>.</param>
/// <param name="culture">The culture handed to the base constructor.</param>
/// <param name="formatString">The format string kept in <c>_FormatString</c>.</param>
/// <param name="fst">The FST handed to the base constructor.</param>
public DateTimePattern(
    Core.Tokenization.DateTimePatternType patternType,
    System.Globalization.CultureInfo culture,
    string formatString,
    LanguagePlatform.Lingua.FST.FST fst)
    : base(fst, culture)
{
    this._PatternType = patternType;
    this._FormatString = formatString;
}
Esempio n. 2
0
        /// <summary>
        /// Creates a recognizer around the given FST and caches the value returned by
        /// <c>GetFirstSet(false)</c> for that FST.
        /// </summary>
        /// <param name="fst">The FST to wrap; must not be null.</param>
        /// <param name="culture">The culture to associate with the recognizer; when null,
        /// the invariant culture is used instead.</param>
        /// <exception cref="ArgumentNullException"><paramref name="fst"/> is null.</exception>
        public FSTRecognizer(LanguagePlatform.Lingua.FST.FST fst,
                             System.Globalization.CultureInfo culture)
        {
            if (fst == null)
            {
                throw new ArgumentNullException("fst");
            }

            _FST = fst;

            // A missing culture silently falls back to the invariant culture.
            _Culture = culture ?? System.Globalization.CultureInfo.InvariantCulture;

            _First = _FST.GetFirstSet(false);
        }
Esempio n. 3
0
        /// <summary>
        /// Attempts to get the compiled FST from the resources, and if that fails, will create it from scratch
        /// </summary>
        /// <param name="culture">The culture to recognize measurements for. Must be non-null,
        /// must not be a neutral culture, and must expose number format information.</param>
        /// <param name="priority">The priority forwarded to the base constructor.</param>
        /// <param name="accessor">The accessor used to look up a precompiled measurement FST.
        /// When null, a new <c>ResourceFileResourceAccessor</c> is used.</param>
        /// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="culture"/> is a neutral culture
        /// or has no number format information.</exception>
        public MeasureFSTRecognizer(System.Globalization.CultureInfo culture, int priority,
                                    Core.Resources.IResourceDataAccessor accessor)
            : base(TokenType.Measurement, priority, "Measurement", "MeasureFSTRecognizer")
        {
            if (culture == null)
            {
                // Name the offending parameter so the failure is diagnosable from the message alone.
                throw new ArgumentNullException("culture");
            }
            if (culture.IsNeutralCulture)
            {
                throw new ArgumentException("Cannot compute measurement patterns for neutral cultures");
            }
            if (culture.NumberFormat == null)
            {
                throw new ArgumentException("No number format info available for the specified culture");
            }
            if (accessor == null)
            {
                accessor = new ResourceFileResourceAccessor();
            }

            LanguagePlatform.Lingua.FST.FST fst;

            if (accessor.GetResourceStatus(culture, Core.Resources.LanguageResourceType.MeasurementFST, true) !=
                Core.Resources.ResourceStatus.NotAvailable)
            {
                // TODO should _Culture be set to the _actual_ culture of the loaded FST, i.e.
                //  the invariant culture for the generic/canonical one?

                byte[] data = accessor.GetResourceData(culture, Core.Resources.LanguageResourceType.MeasurementFST, true);
                if (data == null)
                {
                    // The status check reported the resource as available, yet no data came back.
                    throw new Core.LanguagePlatformException(Core.ErrorCode.ResourceNotAvailable);
                }

                fst = LanguagePlatform.Lingua.FST.FST.Create(data);
            }
            else
            {
                // No stored FST for this culture: compute it from the culture's number format and unit list.
                fst = CreateFST(culture, Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture));
            }

            _FSTRecognizer = new FSTRecognizer(fst, culture);
        }
        /// <summary>
        /// Creates a number recognizer for the given culture. The number FST is always
        /// computed via <c>CreateFST</c> and wrapped in an <c>FSTRecognizer</c>.
        /// </summary>
        /// <param name="culture">The culture to recognize numbers for. Must be non-null,
        /// must not be a neutral culture, and must expose number format information.</param>
        /// <param name="priority">The priority forwarded to the base constructor.</param>
        /// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="culture"/> is a neutral culture
        /// or has no number format information.</exception>
        public NumberFSTRecognizer(System.Globalization.CultureInfo culture, int priority)
            : base(TokenType.Number, priority, "Number", "NumberFSTRecognizer")
        {
            if (culture == null)
            {
                // Name the offending parameter so the failure is diagnosable from the message alone.
                throw new ArgumentNullException("culture");
            }
            if (culture.IsNeutralCulture)
            {
                throw new ArgumentException("Cannot compute number patterns for neutral cultures");
            }
            if (culture.NumberFormat == null)
            {
                throw new ArgumentException("No number format info available for the specified culture");
            }

            LanguagePlatform.Lingua.FST.FST fst = CreateFST(culture, Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture));
            _FSTRecognizer = new FSTRecognizer(fst, culture);
        }
        /// <summary>
        /// Builds the number FST for a culture: obtains the culture's number format data,
        /// turns it into an FST pattern string, compiles that pattern and makes the
        /// resulting automaton deterministic.
        /// </summary>
        /// <param name="culture">The culture whose number format data is used.</param>
        /// <param name="appendWordTerminator">Forwarded to
        /// <c>NumberPatternComputer.ComputeFSTPattern</c> as its last argument.</param>
        /// <returns>The compiled, determinized FST.</returns>
        internal static LanguagePlatform.Lingua.FST.FST CreateFST(System.Globalization.CultureInfo culture,
                                                                  bool appendWordTerminator)
        {
            NumberFormatData formatData = NumberPatternComputer.GetNumberFormatData(culture, true, true);

            string pattern =
                Lingua.Tokenization.NumberPatternComputer.ComputeFSTPattern(formatData, true, appendWordTerminator);

            LanguagePlatform.Lingua.FST.FST automaton = LanguagePlatform.Lingua.FST.FST.Create(pattern);
            automaton.MakeDeterministic();

#if DEBUG
            // Developer switch: flip to true locally to dump the automaton for inspection.
            bool dump = false;
            if (dump)
            {
                automaton.Dump(String.Format("d:/temp/number-fst-{0}.txt", culture.Name));
            }
#endif

            return automaton;
        }
Esempio n. 6
0
        /// <summary>
        /// Builds the measurement FST for a culture. The pattern is the culture's number
        /// pattern, followed by an optional group built from the blank characters, followed
        /// by a group listing every physical unit known for the culture. The pattern is then
        /// compiled and the automaton is made deterministic.
        /// </summary>
        /// <param name="culture">The culture whose number format data and unit list are used.</param>
        /// <param name="appendWordTerminator">When true, the word terminator marker
        /// <c>#&gt;</c> is appended to the pattern.</param>
        /// <returns>The compiled, determinized FST.</returns>
        internal static LanguagePlatform.Lingua.FST.FST CreateFST(System.Globalization.CultureInfo culture,
                                                                  bool appendWordTerminator)
        {
            NumberFormatData nfd
                = NumberPatternComputer.GetNumberFormatData(culture, true, true);

            // The number part never carries its own word terminator; it is appended after the unit group below.
            string numberPattern = Lingua.Tokenization.NumberPatternComputer.ComputeFSTPattern(nfd,
                                                                                               true, false);

            System.Text.StringBuilder sb = new StringBuilder(numberPattern);
            sb.Append("(");
            bool first = true;

            NumberPatternComputer.AppendDisjunction(sb, Core.CharacterProperties.Blanks, 'U', ref first);
            sb.Append(")?(");

            first = true;
            Core.Wordlist units = Core.Tokenization.PhysicalUnit.GetUnits(culture, false);
            foreach (string unit in units.Items)
            {
                // A null or empty entry has no first character to tag with 'U'; indexing
                // unit[0] would throw. Skip it before emitting any separator so that the
                // alternation stays well-formed.
                if (String.IsNullOrEmpty(unit))
                {
                    continue;
                }

                if (first)
                {
                    first = false;
                }
                else
                {
                    sb.Append("|");
                }

                // append single unit, make sure that first char emits 'U' (in case no whitespace
                //  sep is in the input)
                sb.AppendFormat("(<{0}:U>", FST.FST.EscapeSpecial(unit[0]));
                if (unit.Length > 1)
                {
                    sb.Append(FST.FST.EscapeSpecial(unit.Substring(1)));
                }
                sb.Append(")");
            }

            sb.Append(")");

            if (appendWordTerminator)
            {
                // Append "word terminator"
                sb.Append("#>");
            }

            LanguagePlatform.Lingua.FST.FST fst = LanguagePlatform.Lingua.FST.FST.Create(sb.ToString());

            fst.MakeDeterministic();

#if DEBUG
            // Developer switch: flip to true locally to dump the automaton for inspection.
            bool dump = false;
            if (dump)
            {
                fst.Dump(String.Format("d:/temp/measure-fst-{0}.txt", culture.Name));
            }
#endif

            return(fst);
        }
Esempio n. 7
0
        /// <summary>
        /// Attempts to load the FSTs from the resource accessor.
        /// </summary>
        /// <param name="types">Bit mask of the pattern types to load; types not set in the mask are skipped.</param>
        /// <param name="allCalendars">Currently not used by this method.</param>
        /// <returns>
        /// A list holding a single <c>CalendarDateTimePatterns</c> with one entry per loaded FST,
        /// or null when there is no accessor or none of the requested resources is available.
        /// </returns>
        /// <exception cref="Core.LanguagePlatformException">The accessor reports a resource as
        /// available but returns no data for it.</exception>
        private List <CalendarDateTimePatterns> LoadPatterns(Core.Tokenization.DateTimePatternType types, bool allCalendars)
        {
            if (_Accessor == null)
            {
                return(null);
            }

            DateTimePatternType[] iter = new DateTimePatternType[]
            {
                DateTimePatternType.LongDate,
                DateTimePatternType.ShortDate,
                DateTimePatternType.LongTime,
                DateTimePatternType.ShortTime
            };

            List <CalendarDateTimePatterns> result = null;
            // we currently only support one culture pattern
            CalendarDateTimePatterns thePattern = null;

            foreach (DateTimePatternType t in iter)
            {
                if ((types & t) == 0)
                {
                    continue;
                }

                Core.Resources.LanguageResourceType rt = Core.Resources.LanguageResourceType.Undefined;

                switch (t)
                {
                case DateTimePatternType.LongDate:
                    rt = Core.Resources.LanguageResourceType.LongDateFST;
                    break;

                case DateTimePatternType.ShortDate:
                    rt = Core.Resources.LanguageResourceType.ShortDateFST;
                    break;

                case DateTimePatternType.ShortTime:
                    rt = Core.Resources.LanguageResourceType.ShortTimeFST;
                    break;

                case DateTimePatternType.LongTime:
                    rt = Core.Resources.LanguageResourceType.LongTimeFST;
                    break;

                default:
                    throw new Exception("Cannot map token type to corresponding resource type");
                }

                if (_Accessor.GetResourceStatus(_Culture, rt, false) != Core.Resources.ResourceStatus.NotAvailable)
                {
                    byte[] data = _Accessor.GetResourceData(_Culture, rt, false);
                    if (data == null)
                    {
                        // The status check reported the resource as available, yet no data came
                        // back. Fail with the dedicated error code instead of handing null to
                        // FST.Create.
                        throw new Core.LanguagePlatformException(Core.ErrorCode.ResourceNotAvailable);
                    }

                    LanguagePlatform.Lingua.FST.FST fst = LanguagePlatform.Lingua.FST.FST.Create(data);

                    if (thePattern == null)
                    {
                        // The result list is created lazily, on the first FST that is actually loaded.
                        // TODO support the case where some (not all) FSTs are loaded from the resources
                        result = new List <CalendarDateTimePatterns>();
                        // TODO support all calendars
                        thePattern = new CalendarDateTimePatterns(_Culture, null);
                        result.Add(thePattern);
                    }

                    // The original format string is not stored with the resource, hence the placeholder.
                    // TODO compute FIRST() for the FST at load time (or persistently store it?)
                    thePattern.Patterns.Add(new DateTimePattern(t, _Culture, "(unavailable)", fst));
                }
            }

            return(result);
        }
Esempio n. 8
0
        /// <summary>
        /// Computes the date/time patterns of the current culture for a single calendar.
        /// Every distinct, non-ignored format string whose classified type is contained in
        /// <paramref name="types"/> is compiled into a deterministic FST; afterwards all FSTs
        /// of the same pattern type are merged into one automaton per type.
        /// </summary>
        /// <param name="types">Bit mask of the pattern types to compute.</param>
        /// <param name="cal">The calendar to use, or null to use the culture's own calendar.</param>
        /// <returns>
        /// The patterns for the calendar, holding at most one entry per pattern type. The list
        /// is empty when no format string qualified.
        /// </returns>
        private CalendarDateTimePatterns ComputeSinglePattern(Core.Tokenization.DateTimePatternType types, System.Globalization.Calendar cal)
        {
            CalendarDateTimePatterns result;
            List <string>            probePatterns;

            // TODO this doesn't yet work with alternate calendars

            if (cal == null)
            {
                result = new CalendarDateTimePatterns(_Culture, _Culture.Calendar);

                probePatterns = new List <string>(_Culture.DateTimeFormat.GetAllDateTimePatterns());
            }
            else
            {
                result = new CalendarDateTimePatterns(_Culture, cal);

                // Work on a clone so that switching the calendar does not modify the culture's own format info.
                System.Globalization.DateTimeFormatInfo tmp = (System.Globalization.DateTimeFormatInfo)_Culture.DateTimeFormat.Clone();
                tmp.Calendar = cal;

                probePatterns = new List <string>(tmp.GetAllDateTimePatterns());
            }

            // manually augment list of date/time patterns for some languages
            List <string> customPatterns = GetCustomPatterns(_Culture);

            if (customPatterns != null)
            {
                probePatterns.AddRange(customPatterns);
            }

            // Format strings that have already been processed; a hash set keeps the
            // duplicate check constant-time instead of scanning a list for every pattern.
            HashSet <string> seenPatterns = new HashSet <string>();

            // TODO

            /*
             * The current approach computes a transducer which will emit a canonicalized
             * representation of the token value which will later be parsed during
             * tokenization. Alternatively, we could directly emit the parse pattern and
             * skip the canonicalization.
             * */

            foreach (string p in probePatterns)
            {
                if (seenPatterns.Contains(p) || IgnorePattern(_Culture, p))
                {
                    continue;
                }

                seenPatterns.Add(p);

                string rx;
                DateTimePatternType patternType = ClassifyFormatString(p, out rx);
                if (patternType == DateTimePatternType.Unknown)
                {
                    continue;
                }

                if ((types & patternType) == 0)
                {
                    continue;
                }

                // TODO support addWordBoundary
#if DEBUG
                // Developer switch: flip to true locally to append each pattern to a log file.
                bool log = false;
                if (log)
                {
                    // Path.Combine avoids a doubled or platform-specific directory separator.
                    string logPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "builder.log");
                    using (System.IO.StreamWriter output = new System.IO.StreamWriter(logPath, true, System.Text.Encoding.UTF8))
                    {
                        output.WriteLine("{0}:\r\nPattern: {1}\r\nExpression: {2}\r\n",
                                         _Culture.Name, p, rx);
                    }
                }
#endif

                LanguagePlatform.Lingua.FST.FST f = LanguagePlatform.Lingua.FST.FST.Create(rx);
                f.MakeDeterministic();
                result.Patterns.Add(new DateTimePattern(patternType, _Culture, p, f));
            }

            if (result.Patterns.Count > 0)
            {
                // combine FSTs of each pattern type into a single automaton

                // Sorting groups patterns of the same type next to each other. CompareTo is
                // used instead of a subtraction so the comparison can never overflow.
                result.Patterns.Sort((a, b) => ((int)a.PatternType).CompareTo((int)b.PatternType));

                List <DateTimePattern> combined = new List <DateTimePattern>();
                while (result.Patterns.Count > 0)
                {
                    // The head of the list represents its group; all following patterns of
                    // the same type are merged into the head's FST.
                    DateTimePattern     head = result.Patterns[0];
                    DateTimePatternType t    = head.PatternType;
                    int groupSize            = 1;             // intentionally starting at next pattern

                    List <FST.FST> alternatives = new List <Sdl.LanguagePlatform.Lingua.FST.FST>();

                    while (groupSize < result.Patterns.Count && result.Patterns[groupSize].PatternType == t)
                    {
                        alternatives.Add(result.Patterns[groupSize].FST);
                        ++groupSize;
                    }

                    head.FST.Disjunct(alternatives);
                    head.FST.MakeDeterministic();

                    combined.Add(head);

                    // Drop the whole group (head plus merged alternatives) before handling the next type.
                    result.Patterns.RemoveRange(0, groupSize);
                }

                result.Patterns.AddRange(combined);
            }

            return(result);
        }