Esempio n. 1
0
        /// <summary>
        /// Initializes the engine from a model: selects the analyzers, builds the pattern parser,
        /// compiles every entity pattern, and records which built-in entities are referenced.
        /// </summary>
        /// <param name="model">Model to load. Its <c>Macros</c> is replaced with an empty dictionary when null.</param>
        /// <param name="exactAnalyzer">Analyzer for exact matching; when null, <c>GetAnalyzerForLocale(model.Locale)</c> is used.</param>
        /// <param name="fuzzyAnalyzer">Analyzer for fuzzy matching; when null, falls back to <paramref name="exactAnalyzer"/> and then to a double-metaphone analyzer.</param>
        /// <param name="useAllBuiltIns">When true, every name in <c>builtinEntities</c> is enabled and the default datetime pattern is registered.</param>
        /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
        private void LoadDocument(LucyDocument model, Analyzer exactAnalyzer, Analyzer fuzzyAnalyzer, Boolean useAllBuiltIns)
        {
            if (model == null)
            {
                throw new ArgumentNullException(nameof(model));
            }

            // Default "datetime" entity: an alternation over the datetimeV2 sub-types.
            const string defaultDateTimePattern = "(@datetimeV2.date|@datetimeV2.time|@datetimeV2.datetime|@datetimeV2.daterange|@datetimeV2.timerange|@datetimeV2.datetimerange|@datetimeV2.duration)";

            this._lucyModel = model;

            this._exactAnalyzer = exactAnalyzer ?? GetAnalyzerForLocale(model.Locale);

            // An explicitly supplied fuzzy analyzer takes precedence. Without one, a supplied exact
            // analyzer is reused for fuzzy matching; with neither, fall back to double metaphone.
            this._fuzzyAnalyzer = fuzzyAnalyzer ?? exactAnalyzer ??
                                  Analyzer.NewAnonymous((field, textReader) =>
            {
                Tokenizer tokenizer = new StandardTokenizer(LuceneVersion.LUCENE_48, textReader);
                TokenStream stream = new DoubleMetaphoneFilter(tokenizer, 6, false);
                return new TokenStreamComponents(tokenizer, stream);
            });

            this._patternParser = new PatternParser(this._exactAnalyzer, this._fuzzyAnalyzer);

            // Register the default datetime pattern at most once per load, no matter how many
            // patterns reference @datetime or whether useAllBuiltIns is also set.
            bool dateTimePatternAdded = false;
            void AddDefaultDateTimePattern()
            {
                if (!dateTimePatternAdded)
                {
                    EntityPatterns.Add(new EntityPattern("datetime", _patternParser.Parse(defaultDateTimePattern)));
                    dateTimePatternAdded = true;
                }
            }

            if (_lucyModel.Macros == null)
            {
                _lucyModel.Macros = new Dictionary<string, string>();
            }

            if (_lucyModel.Entities != null)
            {
                foreach (var entityModel in _lucyModel.Entities)
                {
                    if (entityModel.Patterns != null)
                    {
                        foreach (var patternModel in entityModel.Patterns)
                        {
                            // The first alternative doubles as the resolution value, unless it contains
                            // pattern operators or a "___" wildcard, in which case there is no resolution.
                            var first = patternModel.First();
                            bool firstIsExpression = first.Any(ch => ch == '@' || ch == '|' || ch == '+' || ch == '*' || ch == '?') || first.Contains("___");
                            string resolution = firstIsExpression ? null : first.Trim('~', '(', ')');

                            // Longest expanded patterns are registered first.
                            foreach (var pattern in patternModel.Select(pat => ExpandMacros(pat)).OrderByDescending(pat => pat.Length))
                            {
                                if (pattern.StartsWith('/') && pattern.EndsWith('/'))
                                {
                                    // /.../ denotes a regular-expression pattern.
                                    RegexEntityPatterns.Add(new RegexEntityRecognizer(entityModel.Name, pattern.Trim('/')));
                                }
                                else
                                {
                                    var patternMatcher = _patternParser.Parse(pattern, entityModel.FuzzyMatch);
                                    if (patternMatcher != null)
                                    {
                                        var ignoreWords = entityModel.Ignore?.Select(ignoreText => ((TokenResolution)Tokenize(ignoreText).First().Resolution).Token) ?? Array.Empty<string>();

                                        if (patternMatcher.ContainsWildcard())
                                        {
                                            // we want to process wildcard patterns last
                                            WildcardEntityPatterns.Add(new EntityPattern(entityModel.Name, resolution, patternMatcher, ignoreWords));
                                        }
                                        else
                                        {
                                            EntityPatterns.Add(new EntityPattern(entityModel.Name, resolution, patternMatcher, ignoreWords));
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                // Auto detect all references to built in entities.
                // Iterate over a snapshot because the default datetime pattern may be appended meanwhile.
                // NOTE(review): only EntityPatterns is scanned; references made solely from
                // WildcardEntityPatterns are not detected here - confirm that is intended.
                foreach (var pattern in this.EntityPatterns.ToList())
                {
                    foreach (var reference in pattern.PatternMatcher.GetEntityReferences().Select(r => r.TrimStart('@')))
                    {
                        if (reference == "datetime" || reference == "datetimeV2")
                        {
                            this.BuiltinEntities.Add("datetime");
                            AddDefaultDateTimePattern();
                        }

                        if (builtinEntities.Contains(reference) ||
                            builtinEntities.Contains(reference.Split('.').First()))
                        {
                            this.BuiltinEntities.Add(reference);
                        }
                    }
                }

                // NOTE(review): external entities are only considered when the model defines
                // Entities, because this check sits inside the Entities != null branch - confirm.
                if (model.ExternalEntities != null)
                {
                    foreach (var externalEntity in model.ExternalEntities)
                    {
                        if (builtinEntities.Contains(externalEntity) ||
                            builtinEntities.Contains(externalEntity.Split('.').First()))
                        {
                            this.BuiltinEntities.Add(externalEntity);
                        }
                    }
                }
            }

            if (useAllBuiltIns)
            {
                BuiltinEntities = new HashSet<string>(builtinEntities);
                AddDefaultDateTimePattern();
            }

            ValidateModel();
        }
Esempio n. 2
0
        /// <summary>
        /// Finds the entities present in the supplied text.
        /// </summary>
        /// <param name="text">Text to search for entities.</param>
        /// <param name="externalEntities">Optional entities produced outside this engine; they are added to the match context before any other matching runs.</param>
        /// <param name="includeInternal">When true, the token entities are returned in front of the matched entities.</param>
        /// <returns>
        /// With <paramref name="includeInternal"/> set, the token entities followed by the matched entities;
        /// otherwise the matched entities ordered by descending score.
        /// </returns>
        public IList<LucyEntity> MatchEntities(string text, IEnumerable<LucyEntity> externalEntities = null, bool includeInternal = false)
        {
            var context = new MatchContext() { Text = text };

            // Seed the context with caller-supplied entities.
            if (externalEntities != null)
            {
                foreach (var supplied in externalEntities)
                {
                    context.AddNewEntity(supplied);
                }

                context.ProcessNewEntities();
            }

            // Built-in recognizers run only when at least one built-in entity is enabled.
            if (this.BuiltinEntities.Any())
            {
                AddBuiltInEntities(context, text, Locale);
                context.ProcessNewEntities();
            }

            // Regex-based entity patterns.
            if (this.RegexEntityPatterns.Any())
            {
                foreach (var recognizer in this.RegexEntityPatterns)
                {
                    foreach (var match in recognizer.Matches(text))
                    {
                        context.NewEntities.Add(match);
                    }
                }

                context.ProcessNewEntities();
            }

            // Every token of the text becomes a @Token entity.
            context.TokenEntities.AddRange(Tokenize(text));

            // Repeat until a complete pass leaves the entity count unchanged.
            while (true)
            {
                int countBeforePass = context.Entities.Count;

                // Try each regular pattern at each token.
                foreach (var token in context.TokenEntities)
                {
                    foreach (var regularPattern in EntityPatterns)
                    {
                        ProcessEntityPattern(context, token, regularPattern);
                    }
                }

                context.ProcessNewEntities();

                // Wildcard patterns run only once the regular patterns stop producing new entities.
                if (countBeforePass == context.Entities.Count && WildcardEntityPatterns.Any())
                {
                    foreach (var token in context.TokenEntities)
                    {
                        foreach (var wildcardPattern in WildcardEntityPatterns)
                        {
                            ProcessEntityPattern(context, token, wildcardPattern);
                        }
                    }

                    context.ProcessNewEntities();
                }

                if (countBeforePass == context.Entities.Count)
                {
                    break;
                }
            }

            context.MergeEntities(context.Entities);
            context.ResolveEntities(context.Entities);

            if (!includeInternal)
            {
                return context.Entities.OrderByDescending(e => e.Score).ToList();
            }

            // Token entities are included only on request; this result is not sorted by score.
            var combined = new List<LucyEntity>(context.TokenEntities);
            combined.AddRange(context.Entities);
            return combined;
        }