/// <summary>
/// Creates a capturer bound to <paramref name="language"/>.
/// </summary>
/// <remarks>
/// Besides storing the language, this prepares two lookups:
/// a single-token pattern unit built from the language's common words
/// (matched with <c>ignoreCase: true</c>), and a set holding the
/// <c>IgnoreCaseHash64</c> value of every stop word returned by
/// <c>StopWords.Spacy.For</c>.
/// </remarks>
/// <param name="language">Language whose common words and stop words are loaded.</param>
public AbbreviationCapturer(Language language)
{
    Language = language;

    // Pattern unit used to discard common words.
    var commonWordPrototype = PatternUnitPrototype.Single()
                                                  .WithTokens(AbbreviationCapturerCommonWords.Get(language), ignoreCase: true);
    DiscardCommonWords = new PatternUnit(commonWordPrototype);

    // Stop words are kept only as hashes, not as strings.
    var stopwordHashes = StopWords.Spacy.For(Language)
                                  .Select(stopword => stopword.AsSpan().IgnoreCaseHash64())
                                  .ToArray();
    Stopwords = new HashSet<ulong>(stopwordHashes);
}
/// <summary>
/// Tests every pattern in <c>Patterns</c> against the beginning of
/// <paramref name="tokens"/> and reports the longest match found.
/// </summary>
/// <param name="tokens">Tokens to match, starting at index 0.</param>
/// <param name="consumedTokens">
/// Number of tokens covered by the longest matching pattern, or 0 when nothing matched.
/// </param>
/// <returns><c>true</c> when at least one pattern consumed one or more tokens.</returns>
public bool IsMatch(Span<Token> tokens, out int consumedTokens)
{
    int longest = -1;

    for (int patternIndex = 0; patternIndex < Patterns.Count; patternIndex++)
    {
        var units = Patterns[patternIndex];
        int position = 0;

        for (int unitIndex = 0; unitIndex < units.Length; unitIndex++)
        {
            PatternUnit unit = units[unitIndex];
            int cursor = position;

            // Consume tokens while the unit keeps matching; a Single-mode unit takes at most one.
            while (cursor < tokens.Length && unit.IsMatch(ref tokens[cursor]))
            {
                cursor++;
                if (unit.Mode == PatternMatchingMode.Single)
                {
                    break;
                }
            }

            if (cursor > position)
            {
                // The unit matched at least one token: advance past what it consumed.
                position = cursor;
            }
            else if (!unit.Optional)
            {
                // A mandatory unit failed: poison the position so this pattern can never win, then abort it.
                position = int.MinValue;
                break;
            }
        }

        if (position > longest)
        {
            longest = position;
        }
    }

    // Zero-length matches (e.g. only optional units, none matching) do not count as a match.
    bool matched = longest > 0;
    consumedTokens = matched ? longest : 0;
    return matched;
}
/// <summary>
/// Lazily enumerates the literal tokens carried by a pattern unit:
/// first its <c>Token</c> (when not null), then every entry of its <c>Set</c> (when not null).
/// </summary>
/// <param name="pu">Pattern unit to read the tokens from.</param>
/// <returns>The unit's token followed by the members of its set; empty when both are null.</returns>
private IEnumerable<string> GetTokens(PatternUnit pu)
{
    var single = pu.Token;
    if (!(single is null))
    {
        yield return single;
    }

    var members = pu.Set;
    if (members is null)
    {
        yield break;
    }

    foreach (var member in members)
    {
        yield return member;
    }
}
/// <summary>
/// Creates a pattern unit from explicitly supplied values.
/// </summary>
/// <remarks>
/// <paramref name="shape"/> is not stored verbatim: when non-null it is passed through
/// <c>Shape(false)</c> first. When <paramref name="setHashes"/> is null and
/// <paramref name="set"/> is not, the hashes are computed from the set, using the
/// case-sensitive or case-insensitive hash according to <paramref name="caseSensitive"/>.
/// </remarks>
/// <param name="mode">Matching mode of the unit.</param>
/// <param name="optional">Whether the unit is optional.</param>
/// <param name="caseSensitive">Whether token comparison is case-sensitive; also selects the hash used for <paramref name="set"/>.</param>
/// <param name="type">Kind of test the unit performs.</param>
/// <param name="pos">Part-of-speech values stored on the unit.</param>
/// <param name="suffix">Suffix specification, stored as given.</param>
/// <param name="prefix">Prefix specification, stored as given.</param>
/// <param name="shape">Shape source text; may be null.</param>
/// <param name="token">Literal token, stored as given.</param>
/// <param name="set">Set of literal tokens; may be null.</param>
/// <param name="entityType">Entity type specification, stored as given.</param>
/// <param name="setHashes">Precomputed hashes for <paramref name="set"/>; used as-is when not null.</param>
/// <param name="tokenHash">Precomputed hash stored in <c>TokenHash</c>.</param>
/// <param name="leftSide">Left operand unit, stored as given.</param>
/// <param name="rightSide">Right operand unit, stored as given.</param>
public PatternUnit(PatternMatchingMode mode, bool optional, bool caseSensitive, PatternUnitType type, PartOfSpeech[] pos, string suffix, string prefix, string shape, string token, HashSet<string> set, string entityType, HashSet<ulong> setHashes, ulong tokenHash, PatternUnit leftSide, PatternUnit rightSide)
{
    // Matching behaviour.
    Mode = mode;
    Optional = optional;
    CaseSensitive = caseSensitive;
    Type = type;

    // Token-level criteria.
    POS = pos;
    Suffix = suffix;
    Prefix = prefix;
    Shape = shape?.AsSpan().Shape(false);
    Token = token;
    Set = set;
    EntityType = entityType;

    // Use the supplied hashes when present; otherwise derive them from the set, if there is one.
    var hashes = setHashes;
    if (hashes is null && set is object)
    {
        hashes = new HashSet<ulong>();
        foreach (var entry in set)
        {
            hashes.Add(CaseSensitive
                ? PatternUnitPrototype.Hash64(entry.AsSpan())
                : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan()));
        }
    }
    SetHashes = hashes;
    TokenHash = tokenHash;

    // Operand units.
    LeftSide = leftSide;
    RightSide = rightSide;
}
/// <summary>
/// Creates a pattern unit from explicitly supplied values and precomputes the
/// split forms of its suffix, prefix, entity type and shape.
/// </summary>
/// <remarks>
/// <paramref name="shape"/> is passed through <c>Shape(false)</c> before being stored.
/// When <paramref name="setHashes"/> is null and <paramref name="set"/> is not, the hashes
/// are computed from the set, using the case-sensitive or case-insensitive hash according
/// to <paramref name="caseSensitive"/>. Suffix, prefix and shape are split with
/// <c>splitCharWithWhitespaces</c>; the entity type is split with <c>splitChar</c>.
/// Empty entries are dropped in every case, and each cache stays null when its source is null.
/// </remarks>
/// <param name="mode">Matching mode of the unit.</param>
/// <param name="optional">Whether the unit is optional.</param>
/// <param name="caseSensitive">Whether token comparison is case-sensitive; also selects the hash used for <paramref name="set"/>.</param>
/// <param name="type">Kind of test the unit performs.</param>
/// <param name="pos">Part-of-speech values stored on the unit.</param>
/// <param name="suffix">Delimited suffix list; may be null.</param>
/// <param name="prefix">Delimited prefix list; may be null.</param>
/// <param name="shape">Shape source text; may be null.</param>
/// <param name="token">Literal token, stored as given.</param>
/// <param name="set">Set of literal tokens; may be null.</param>
/// <param name="entityType">Delimited entity type list; may be null.</param>
/// <param name="setHashes">Precomputed hashes for <paramref name="set"/>; used as-is when not null.</param>
/// <param name="tokenHash">Precomputed hash stored in <c>TokenHash</c>.</param>
/// <param name="leftSide">Left operand unit, stored as given.</param>
/// <param name="rightSide">Right operand unit, stored as given.</param>
public PatternUnit(PatternMatchingMode mode, bool optional, bool caseSensitive, PatternUnitType type, PartOfSpeech[] pos, string suffix, string prefix, string shape, string token, HashSet<string> set, string entityType, HashSet<ulong> setHashes, ulong tokenHash, PatternUnit leftSide, PatternUnit rightSide)
{
    // Matching behaviour.
    Mode = mode;
    Optional = optional;
    CaseSensitive = caseSensitive;
    Type = type;

    // Token-level criteria.
    POS = pos;
    Suffix = suffix;
    Prefix = prefix;
    Shape = shape?.AsSpan().Shape(false);
    Token = token;
    Set = set;
    EntityType = entityType;

    // Use the supplied hashes when present; otherwise derive them from the set, if there is one.
    var hashes = setHashes;
    if (hashes is null && set is object)
    {
        hashes = new HashSet<ulong>();
        foreach (var entry in set)
        {
            hashes.Add(CaseSensitive
                ? PatternUnitPrototype.Hash64(entry.AsSpan())
                : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan()));
        }
    }
    SetHashes = hashes;
    TokenHash = tokenHash;

    // Operand units.
    LeftSide = leftSide;
    RightSide = rightSide;

    // Split caches: suffixes and prefixes are de-duplicated arrays.
    _splitSuffix = Suffix is null
        ? null
        : Suffix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
    _splitPrefix = Prefix is null
        ? null
        : Prefix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();

    // Entity types and shapes are kept as sets. Note the entity type uses splitChar, not splitCharWithWhitespaces.
    _splitEntityType = EntityType is null
        ? null
        : new HashSet<string>(EntityType.Split(splitChar, StringSplitOptions.RemoveEmptyEntries));
    _splitShape = Shape is null
        ? null
        : new HashSet<string>(Shape.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries));
}
/// <summary>
/// Creates a pattern unit by copying the settings of a prototype.
/// </summary>
/// <remarks>
/// The argument is cast to <c>PatternUnitPrototype</c>, so other <c>IPatternUnit</c>
/// implementations are not accepted. When the prototype has no precomputed set hashes but
/// does have a set, the hashes are computed here, using the case-sensitive or
/// case-insensitive hash according to the prototype's <c>CaseSensitive</c> flag.
/// Left and right operands, when present, are converted recursively through this constructor.
/// </remarks>
/// <param name="prototype">Prototype to copy from; must be a <c>PatternUnitPrototype</c>.</param>
public PatternUnit(IPatternUnit prototype)
{
    var source = (PatternUnitPrototype)prototype;

    // Matching behaviour.
    Mode = source.Mode;
    Optional = source.Optional;
    CaseSensitive = source.CaseSensitive;
    Type = source.Type;

    // Token-level criteria.
    POS = source.POS;
    Suffix = source.Suffix;
    Prefix = source.Prefix;
    Shape = source.Shape;
    Token = source.Token;
    Set = source.Set;
    EntityType = source.EntityType;

    // Reuse the prototype's hashes when present; otherwise derive them from its set, if there is one.
    var hashes = source.SetHashes;
    if (hashes is null && source.Set is object)
    {
        hashes = new HashSet<ulong>();
        foreach (var entry in source.Set)
        {
            hashes.Add(source.CaseSensitive
                ? PatternUnitPrototype.Hash64(entry.AsSpan())
                : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan()));
        }
    }
    SetHashes = hashes;
    TokenHash = source.TokenHash;

    // Operand prototypes become pattern units of their own.
    LeftSide = source.LeftSide is null ? null : new PatternUnit(source.LeftSide);
    RightSide = source.RightSide is null ? null : new PatternUnit(source.RightSide);

    // Character and length constraints.
    ValidChars = source.ValidChars;
    MinLength = source.MinLength;
    MaxLength = source.MaxLength;
}
/// <summary>
/// Checks whether <paramref name="pu"/> passes the two-argument
/// <c>IsSimpleForgetEntityType</c> test and additionally has the requested case handling.
/// </summary>
/// <param name="pu">Pattern unit to inspect.</param>
/// <param name="entityType">Entity type the unit must carry.</param>
/// <param name="ignoreCase">
/// Expected case handling: the unit qualifies only when its <c>CaseSensitive</c> flag
/// is the opposite of this value.
/// </param>
/// <returns><c>true</c> when both conditions hold.</returns>
public bool IsSimpleForgetEntityType(PatternUnit pu, string entityType, bool ignoreCase)
{
    if (!IsSimpleForgetEntityType(pu, entityType))
    {
        return false;
    }

    // Case-sensitive units correspond to ignoreCase == false, and vice versa.
    return pu.CaseSensitive != ignoreCase;
}
/// <summary>
/// Checks whether <paramref name="pu"/> is a single-mode unit whose type is exactly
/// <c>Token | Entity</c> or <c>Set | Entity</c> and whose entity type equals
/// <paramref name="entityType"/>.
/// </summary>
/// <param name="pu">Pattern unit to inspect.</param>
/// <param name="entityType">Entity type the unit must carry.</param>
/// <returns><c>true</c> when all three conditions hold.</returns>
public bool IsSimpleForgetEntityType(PatternUnit pu, string entityType)
{
    if (pu.Mode != PatternMatchingMode.Single)
    {
        return false;
    }

    // The type must be exactly one of the two flag combinations, not merely contain them.
    var kind = pu.Type;
    bool isTokenWithEntity = kind == (PatternUnitType.Token | PatternUnitType.Entity);
    bool isSetWithEntity = kind == (PatternUnitType.Set | PatternUnitType.Entity);
    if (!isTokenWithEntity && !isSetWithEntity)
    {
        return false;
    }

    return pu.EntityType == entityType;
}