/// <summary>
/// Load a Lucy model: build the exact/fuzzy analyzers, compile entity patterns,
/// and register any referenced built-in entity recognizers.
/// </summary>
/// <param name="model">model to load.</param>
/// <param name="exactAnalyzer">analyzer for exact token matching; when null a locale-appropriate analyzer is used.</param>
/// <param name="fuzzyAnalyzer">analyzer for fuzzy token matching; when null falls back to the exact analyzer or a DoubleMetaphone-based default.</param>
/// <param name="useAllBuiltIns">when true, enable every known built-in entity recognizer regardless of model references.</param>
private void LoadDocument(LucyDocument model, Analyzer exactAnalyzer, Analyzer fuzzyAnalyzer, bool useAllBuiltIns)
{
    this._lucyModel = model;
    this._exactAnalyzer = exactAnalyzer ?? GetAnalyzerForLocale(model.Locale);

    // BUGFIX: an explicitly supplied fuzzyAnalyzer must take precedence. The previous
    // code evaluated exactAnalyzer first, which silently ignored the fuzzyAnalyzer
    // argument whenever a custom exact analyzer was passed in.
    this._fuzzyAnalyzer = fuzzyAnalyzer ?? exactAnalyzer ?? Analyzer.NewAnonymous((field, textReader) =>
    {
        Tokenizer tokenizer = new StandardTokenizer(LuceneVersion.LUCENE_48, textReader);
        // Phonetic normalization so spelling variants map to the same tokens.
        TokenStream stream = new DoubleMetaphoneFilter(tokenizer, 6, false);
        return new TokenStreamComponents(tokenizer, stream);
    });

    this._patternParser = new PatternParser(this._exactAnalyzer, this._fuzzyAnalyzer);

    if (_lucyModel.Macros == null)
    {
        _lucyModel.Macros = new Dictionary<string, string>();
    }

    // Shared so both the reference scan and the useAllBuiltIns branch can avoid
    // registering the default datetime pattern more than once.
    const string dateTimeVariations = "(@datetimeV2.date|@datetimeV2.time|@datetimeV2.datetime|@datetimeV2.daterange|@datetimeV2.timerange|@datetimeV2.datetimerange|@datetimeV2.duration)";
    bool addedDateTimePattern = false;

    if (_lucyModel.Entities != null)
    {
        foreach (var entityModel in _lucyModel.Entities)
        {
            if (entityModel.Patterns == null)
            {
                continue;
            }

            foreach (var patternModel in entityModel.Patterns)
            {
                // Derive a default resolution from the first pattern, but only when it is a
                // plain literal (no pattern operators and no "___" wildcard marker).
                // NOTE(review): the resolution is taken before macro expansion — confirm intentional.
                var first = patternModel.First();
                string resolution = first.Any(ch => ch == '@' || ch == '|' || ch == '+' || ch == '*' || ch == '?') || first.Contains("___")
                    ? null
                    : first.Trim('~', '(', ')');

                // Longest patterns first so more specific patterns are registered before shorter ones.
                foreach (var pattern in patternModel.Select(pat => ExpandMacros(pat)).OrderByDescending(pat => pat.Length))
                {
                    if (pattern.StartsWith('/') && pattern.EndsWith('/'))
                    {
                        // Patterns of the form /.../ are raw regular expressions.
                        RegexEntityPatterns.Add(new RegexEntityRecognizer(entityModel.Name, pattern.Trim('/')));
                        continue;
                    }

                    var patternMatcher = _patternParser.Parse(pattern, entityModel.FuzzyMatch);
                    if (patternMatcher == null)
                    {
                        continue;
                    }

                    // Tokenize each ignore word so comparisons happen on normalized tokens.
                    var ignoreWords = entityModel.Ignore?.Select(ignoreText => ((TokenResolution)Tokenize(ignoreText).First().Resolution).Token)
                        ?? Array.Empty<string>();

                    if (patternMatcher.ContainsWildcard())
                    {
                        // Wildcard patterns must be processed last (see MatchEntities).
                        WildcardEntityPatterns.Add(new EntityPattern(entityModel.Name, resolution, patternMatcher, ignoreWords));
                    }
                    else
                    {
                        EntityPatterns.Add(new EntityPattern(entityModel.Name, resolution, patternMatcher, ignoreWords));
                    }
                }
            }
        }

        // Auto detect all references to built in entities.
        // ToList() snapshots the collection because the loop body appends to EntityPatterns.
        foreach (var pattern in this.EntityPatterns.ToList())
        {
            foreach (var reference in pattern.PatternMatcher.GetEntityReferences().Select(r => r.TrimStart('@')))
            {
                if (reference == "datetime" || reference == "datetimeV2")
                {
                    this.BuiltinEntities.Add("datetime");

                    // BUGFIX: previously the default datetime pattern was appended once per
                    // datetime reference, creating duplicate entity patterns.
                    if (!addedDateTimePattern)
                    {
                        addedDateTimePattern = true;
                        // add default pattern for datetime = (all permutations of datetime)
                        EntityPatterns.Add(new EntityPattern("datetime", _patternParser.Parse(dateTimeVariations)));
                    }
                }

                if (builtinEntities.Contains(reference) || builtinEntities.Contains(reference.Split('.').First()))
                {
                    this.BuiltinEntities.Add(reference);
                }
            }
        }

        if (model.ExternalEntities != null)
        {
            foreach (var externalEntity in model.ExternalEntities)
            {
                if (builtinEntities.Contains(externalEntity) || builtinEntities.Contains(externalEntity.Split('.').First()))
                {
                    this.BuiltinEntities.Add(externalEntity);
                }
            }
        }
    }

    if (useAllBuiltIns)
    {
        BuiltinEntities = new HashSet<string>(builtinEntities);

        // add default pattern for datetime = (all permutations of datetime),
        // unless the reference scan above already registered it.
        if (!addedDateTimePattern)
        {
            EntityPatterns.Add(new EntityPattern("datetime", _patternParser.Parse(dateTimeVariations)));
        }
    }

    ValidateModel();
}
/// <summary>
/// Match entities in given text.
/// </summary>
/// <param name="text">text to match against.</param>
/// <param name="externalEntities">externally provided, already-recognized entities to seed the match with.</param>
/// <param name="includeInternal">if true, the internal token entities are included in the returned list.</param>
/// <returns>entities found in the text.</returns>
public IList<LucyEntity> MatchEntities(string text, IEnumerable<LucyEntity> externalEntities = null, bool includeInternal = false)
{
    var context = new MatchContext()
    {
        Text = text,
    };

    // Seed the context with externally supplied entities so patterns can reference them.
    if (externalEntities != null)
    {
        foreach (var externalEntity in externalEntities)
        {
            context.AddNewEntity(externalEntity);
        }
        context.ProcessNewEntities();
    }

    // Run the built-in entity recognizers that the loaded model references.
    if (this.BuiltinEntities.Any())
    {
        AddBuiltInEntities(context, text, Locale);
        context.ProcessNewEntities();
    }

    // Add regex pattern entities
    if (this.RegexEntityPatterns.Any())
    {
        foreach (var regex in this.RegexEntityPatterns)
        {
            foreach (var entity in regex.Matches(text))
            {
                context.NewEntities.Add(entity);
            }
        }
        context.ProcessNewEntities();
    }

    // add all @Token entities
    context.TokenEntities.AddRange(Tokenize(text));

    // Fixed-point loop: keep applying patterns until a full pass adds no new entities,
    // since an entity recognized in one pass may enable further pattern matches.
    int count = 0;
    do
    {
        count = context.Entities.Count;

        // foreach text token
        foreach (var tokenEntity in context.TokenEntities)
        {
            // foreach entity pattern
            foreach (var entityPattern in EntityPatterns)
            {
                ProcessEntityPattern(context, tokenEntity, entityPattern);
            }
        }
        context.ProcessNewEntities();

        // NOTE(review): the condition below means "NO entities were added this pass"
        // (the original comment said the opposite). Wildcard patterns run only once the
        // non-wildcard patterns have reached a fixed point — presumably so wildcards
        // don't consume tokens normal patterns could still match; confirm with author.
        if (count == context.Entities.Count && WildcardEntityPatterns.Any())
        {
            // process wildcard patterns
            foreach (var textEntity in context.TokenEntities)
            {
                foreach (var entityPattern in WildcardEntityPatterns)
                {
                    ProcessEntityPattern(context, textEntity, entityPattern);
                }
            }
            context.ProcessNewEntities();
        }
    } while (count != context.Entities.Count);

    context.MergeEntities(context.Entities);
    context.ResolveEntities(context.Entities);

    // only include tokenEntities if they ask for them
    if (includeInternal)
    {
        // NOTE(review): this path returns unsorted results, unlike the score-ordered
        // path below — confirm that is intentional.
        var merged = new List<LucyEntity>(context.TokenEntities);
        merged.AddRange(context.Entities);
        return(merged);
    }

    return(context.Entities.OrderByDescending(e => e.Score).ToList());
}