//public void RemoveForgetPattern(string entityType, string name)
//{
//    if (!Data.AutoOptimizePatterns.Value) { return; }
//    RWLock.EnterWriteLock();
//    try
//    {
//        if (Data.RemovePatternsPerEntityType.TryGetValue(entityType, out var list))
//        {
//            list.RemoveAll(p => p.Name == name);
//            if (list.Count == 0) { Data.RemovePatternsPerEntityType.Remove(entityType); }
//        }
//    }
//    finally
//    {
//        RWLock.ExitWriteLock();
//    }
//}

// NOTE(review): in the disabled method below, the ExitWriteLock() inside the if-block is followed by
// another one in the finally block; fix that before re-enabling it.
//public void RemoveAddPattern(string entityType, string name)
//{
//    RWLock.EnterWriteLock();
//    try
//    {
//        if (Data.AddPatternPerEntityType.TryGetValue(entityType, out var mp))
//        {
//            mp.Patterns.RemoveAll(p => p.Name == name);
//            if (mp.Patterns.Count == 0) { Data.AddPatternPerEntityType.Remove(entityType); }
//            RWLock.ExitWriteLock();
//        }
//    }
//    finally
//    {
//        RWLock.ExitWriteLock();
//    }
//}

/// <summary>
/// Rebuilds <c>mp.Patterns</c> in place. Patterns whose units are all "simple" for
/// <paramref name="entityType"/> are merged, separately for each pattern length and for each case mode
/// (case-sensitive first, then ignore-case), into a single pattern whose unit at every position holds the
/// union of the tokens seen at that position.
/// </summary>
/// <param name="entityType">Entity type handed to the simplicity checks and applied to the merged units.</param>
/// <param name="mp">Matching pattern whose <c>Patterns</c> list is cleared and repopulated.</param>
/// <param name="isAdd">
/// When true, <c>IsSimpleAddEntityType</c> and <c>WithoutEntityType</c> are used;
/// otherwise <c>IsSimpleForgetEntityType</c> and <c>WithEntityType</c>.
/// </param>
public void OptimizeMatchingPattern(string entityType, MatchingPattern mp, bool isAdd)
{
    // Snapshot first: the list is emptied right away and rebuilt from this copy.
    var originalPatterns = mp.Patterns.ToArray();
    mp.Patterns.Clear();

    // Patterns in which no unit is simple are carried over unchanged.
    // NOTE(review): a pattern where only some units are simple appears to satisfy neither this filter nor the
    // merge filter below, so it would not be re-added - confirm that this is intended.
    var untouched = originalPatterns
        .Where(pattern => pattern.All(unit => !(isAdd
            ? IsSimpleAddEntityType(unit, entityType)
            : IsSimpleForgetEntityType(unit, entityType))))
        .ToList();
    mp.Patterns.AddRange(untouched);

    var caseModes = new bool[] { false, true };
    foreach (var ignoreCase in caseModes)
    {
        // Pattern length -> one entry per mergeable pattern, each entry holding the token array of every unit.
        var tokensByLength = new Dictionary<int, List<List<string[]>>>();

        var mergeable = originalPatterns.Where(pattern => pattern.All(unit => isAdd
            ? IsSimpleAddEntityType(unit, entityType, ignoreCase)
            : IsSimpleForgetEntityType(unit, entityType, ignoreCase)));

        foreach (var pattern in mergeable)
        {
            if (!tokensByLength.TryGetValue(pattern.Length, out var bucket))
            {
                bucket = new List<List<string[]>>();
                tokensByLength[pattern.Length] = bucket;
            }

            var tokensPerUnit = pattern.Select(unit => GetTokens(unit).ToArray()).ToList();
            bucket.Add(tokensPerUnit);
        }

        foreach (var entry in tokensByLength)
        {
            var unitCount = entry.Key;

            // One token set per unit position.
            var tokenSets = new HashSet<string>[unitCount];
            for (int position = 0; position < unitCount; position++)
            {
                tokenSets[position] = new HashSet<string>();
            }

            foreach (var tokensPerUnit in entry.Value)
            {
                for (int position = 0; position < unitCount; position++)
                {
                    tokenSets[position].UnionWith(tokensPerUnit[position]);
                }
            }

            var prototypes = new List<IPatternUnit>();
            prototypes.AddRange(tokenSets.Select(tokenSet => PatternUnitPrototype.Single().WithTokens(tokenSet, ignoreCase)));

            // The return values are discarded here, exactly as before; this presumably relies on the
            // With*/Without* calls updating the prototype they are invoked on - verify.
            foreach (var prototype in prototypes)
            {
                if (isAdd)
                {
                    prototype.WithoutEntityType(entityType);
                }
                else
                {
                    prototype.WithEntityType(entityType);
                }
            }

            var mergedPattern = prototypes.Select(prototype => new PatternUnit(prototype)).ToArray();
            mp.Patterns.Add(mergedPattern);
        }
    }
}
/// <summary>
/// Creates an abbreviation capturer for <paramref name="language"/>: stores the language, builds a
/// case-insensitive pattern unit from the language's common-word list, and caches the ignore-case
/// 64-bit hashes of the language's spaCy stop words.
/// </summary>
/// <param name="language">Language used to look up the common words and the stop words.</param>
public AbbreviationCapturer(Language language)
{
    Language = language;

    // Unit built from the common-word list, matched without regard to case.
    var wordsToDiscard = AbbreviationCapturerCommonWords.Get(language);
    var discardPrototype = PatternUnitPrototype.Single().WithTokens(wordsToDiscard, ignoreCase: true);
    DiscardCommonWords = new PatternUnit(discardPrototype);

    // Stop words are kept only as hashes.
    var stopwordHashes = StopWords.Spacy.For(Language)
                                  .Select(word => word.AsSpan().IgnoreCaseHash64())
                                  .ToArray();
    Stopwords = new HashSet<ulong>(stopwordHashes);
}
/// <summary>
/// Returns true when the hash of <paramref name="token"/> is contained in the hashed form of <c>Set</c>.
/// The hashed set is built on first use when <c>SetHashes</c> has not been populated yet.
/// </summary>
/// <param name="token">Token to look up; hashed with the same case rule as the set entries.</param>
private bool MatchSet(ref Token token)
{
    var hashes = SetHashes;
    if (hashes is null)
    {
        // No lock needed here: on a collision we would just replace one set with another equal set.
        hashes = new HashSet<ulong>(Set.Select(entry =>
        {
            if (CaseSensitive)
            {
                return PatternUnitPrototype.Hash64(entry.AsSpan());
            }

            return PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan());
        }));
        SetHashes = hashes;
    }

    return hashes.Contains(GetTokenHash(ref token));
}
/// <summary>
/// Hashes the token's value span: <c>Hash64</c> when this unit is case-sensitive,
/// <c>IgnoreCaseHash64</c> otherwise.
/// </summary>
/// <param name="token">Token whose <c>ValueAsSpan</c> is hashed.</param>
/// <returns>The 64-bit hash of the token value.</returns>
private ulong GetTokenHash(ref Token token)
{
    if (CaseSensitive)
    {
        return PatternUnitPrototype.Hash64(token.ValueAsSpan);
    }

    return PatternUnitPrototype.IgnoreCaseHash64(token.ValueAsSpan);
}
/// <summary>
/// Creates a pattern unit from explicit field values and precomputes the split suffix, prefix,
/// entity-type and shape caches.
/// </summary>
/// <param name="shape">Shape text; when non-null it is passed through the span <c>Shape(false)</c> helper before being stored.</param>
/// <param name="set">Token set; hashed into <c>SetHashes</c> when <paramref name="setHashes"/> is null.</param>
/// <param name="setHashes">Precomputed hashes of <paramref name="set"/>; used as-is when non-null.</param>
/// <remarks>All remaining parameters are stored unchanged in the member of the same name.</remarks>
public PatternUnit(PatternMatchingMode mode, bool optional, bool caseSensitive, PatternUnitType type, PartOfSpeech[] pos, string suffix, string prefix, string shape, string token, HashSet<string> set, string entityType, HashSet<ulong> setHashes, ulong tokenHash, PatternUnit leftSide, PatternUnit rightSide)
{
    Mode          = mode;
    Optional      = optional;
    CaseSensitive = caseSensitive;
    Type          = type;
    POS           = pos;
    Suffix        = suffix;
    Prefix        = prefix;
    Shape         = shape?.AsSpan().Shape(false);
    Token         = token;
    Set           = set;
    EntityType    = entityType;

    // Prefer the hashes supplied by the caller; otherwise derive them from the set (if there is one).
    if (setHashes is object)
    {
        SetHashes = setHashes;
    }
    else if (set is null)
    {
        SetHashes = null;
    }
    else
    {
        SetHashes = new HashSet<ulong>(set.Select(entry => CaseSensitive
            ? PatternUnitPrototype.Hash64(entry.AsSpan())
            : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan())));
    }

    TokenHash = tokenHash;
    LeftSide  = leftSide;
    RightSide = rightSide;

    // Split caches: a null source string yields a null cache.
    _splitSuffix = Suffix is null
        ? null
        : Suffix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
    _splitPrefix = Prefix is null
        ? null
        : Prefix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
    _splitEntityType = EntityType is null
        ? null
        : new HashSet<string>(EntityType.Split(splitChar, StringSplitOptions.RemoveEmptyEntries));
    _splitShape = Shape is null
        ? null
        : new HashSet<string>(Shape.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries));
}
/// <summary>
/// Creates a pattern unit by copying the members of a prototype, recursively converting its left and
/// right sides, and precomputing the split suffix, prefix, entity-type and shape caches.
/// </summary>
/// <param name="prototype">Prototype to copy from; it is cast to <see cref="PatternUnitPrototype"/>.</param>
public PatternUnit(IPatternUnit prototype)
{
    var source = (PatternUnitPrototype)prototype;

    Mode          = source.Mode;
    Optional      = source.Optional;
    CaseSensitive = source.CaseSensitive;
    Type          = source.Type;
    POS           = source.POS;
    Suffix        = source.Suffix;
    Prefix        = source.Prefix;
    Shape         = source.Shape;
    Token         = source.Token;
    Set           = source.Set;
    EntityType    = source.EntityType;

    // Reuse the prototype's hashes when it has them; otherwise hash its set (if there is one).
    var inheritedHashes = source.SetHashes;
    if (inheritedHashes is null && source.Set is object)
    {
        inheritedHashes = new HashSet<ulong>(source.Set.Select(entry => source.CaseSensitive
            ? PatternUnitPrototype.Hash64(entry.AsSpan())
            : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan())));
    }
    SetHashes = inheritedHashes;

    TokenHash = source.TokenHash;

    // Nested prototypes are converted with this same constructor.
    LeftSide  = source.LeftSide is null ? null : new PatternUnit(source.LeftSide);
    RightSide = source.RightSide is null ? null : new PatternUnit(source.RightSide);

    ValidChars = source.ValidChars;
    MinLength  = source.MinLength;
    MaxLength  = source.MaxLength;

    // Split caches: a null source string yields a null cache.
    _splitSuffix = Suffix is null
        ? null
        : Suffix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
    _splitPrefix = Prefix is null
        ? null
        : Prefix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
    _splitEntityType = EntityType is null
        ? null
        : new HashSet<string>(EntityType.Split(splitChar, StringSplitOptions.RemoveEmptyEntries));
    _splitShape = Shape is null
        ? null
        : new HashSet<string>(Shape.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries));
}
/// <summary>
/// Creates a pattern unit from explicit field values.
/// </summary>
/// <param name="shape">Shape text; when non-null it is passed through the span <c>Shape(false)</c> helper before being stored.</param>
/// <param name="set">Token set; hashed into <c>SetHashes</c> when <paramref name="setHashes"/> is null.</param>
/// <param name="setHashes">Precomputed hashes of <paramref name="set"/>; used as-is when non-null.</param>
/// <remarks>All remaining parameters are stored unchanged in the member of the same name.</remarks>
public PatternUnit(PatternMatchingMode mode, bool optional, bool caseSensitive, PatternUnitType type, PartOfSpeech[] pos, string suffix, string prefix, string shape, string token, HashSet<string> set, string entityType, HashSet<ulong> setHashes, ulong tokenHash, PatternUnit leftSide, PatternUnit rightSide)
{
    Mode = mode;
    Optional = optional;
    CaseSensitive = caseSensitive;
    Type = type;
    POS = pos;

    Suffix = suffix;
    Prefix = prefix;
    Shape = shape?.AsSpan().Shape(false);
    Token = token;
    Set = set;
    EntityType = entityType;

    // Caller-supplied hashes win; without them the set, when present, is hashed with the unit's case rule.
    var resolvedHashes = setHashes;
    if (resolvedHashes is null && set is object)
    {
        resolvedHashes = new HashSet<ulong>(set.Select(word =>
        {
            var chars = word.AsSpan();
            return CaseSensitive ? PatternUnitPrototype.Hash64(chars) : PatternUnitPrototype.IgnoreCaseHash64(chars);
        }));
    }
    SetHashes = resolvedHashes;

    TokenHash = tokenHash;
    LeftSide = leftSide;
    RightSide = rightSide;
}
/// <summary>
/// Creates a pattern unit by copying the members of a prototype and recursively converting its
/// left and right sides.
/// </summary>
/// <param name="prototype">Prototype to copy from; it is cast to <see cref="PatternUnitPrototype"/>.</param>
public PatternUnit(IPatternUnit prototype)
{
    var template = (PatternUnitPrototype)prototype;

    Mode = template.Mode;
    Optional = template.Optional;
    CaseSensitive = template.CaseSensitive;
    Type = template.Type;
    POS = template.POS;

    Suffix = template.Suffix;
    Prefix = template.Prefix;
    Shape = template.Shape;
    Token = template.Token;
    Set = template.Set;
    EntityType = template.EntityType;

    // Take the prototype's hashes when available; otherwise hash its set, or leave null when there is no set.
    if (template.SetHashes is object)
    {
        SetHashes = template.SetHashes;
    }
    else if (template.Set is null)
    {
        SetHashes = null;
    }
    else
    {
        SetHashes = new HashSet<ulong>(template.Set.Select(word => template.CaseSensitive
            ? PatternUnitPrototype.Hash64(word.AsSpan())
            : PatternUnitPrototype.IgnoreCaseHash64(word.AsSpan())));
    }

    TokenHash = template.TokenHash;

    // Nested prototypes are converted with this same constructor.
    LeftSide = template.LeftSide is null ? null : new PatternUnit(template.LeftSide);
    RightSide = template.RightSide is null ? null : new PatternUnit(template.RightSide);

    ValidChars = template.ValidChars;
    MinLength = template.MinLength;
    MaxLength = template.MaxLength;
}