Пример #1
0
        //public void RemoveForgetPattern(string entityType, string name)
        //{
        //    if (!Data.AutoOptimizePatterns.Value) { return; }
        //    RWLock.EnterWriteLock();
        //    try
        //    {
        //        if (Data.RemovePatternsPerEntityType.TryGetValue(entityType, out var list))
        //        {
        //            list.RemoveAll(p => p.Name == name);
        //            if (list.Count == 0) { Data.RemovePatternsPerEntityType.Remove(entityType); }
        //        }
        //    }
        //    finally
        //    {
        //        RWLock.ExitWriteLock();
        //    }
        //}

        //public void RemoveAddPattern(string entityType, string name)
        //{
        //    RWLock.EnterWriteLock();

        //    try
        //    {
        //        if (Data.AddPatternPerEntityType.TryGetValue(entityType, out var mp))
        //        {
        //            mp.Patterns.RemoveAll(p => p.Name == name);
        //            if (mp.Patterns.Count == 0) { Data.AddPatternPerEntityType.Remove(entityType); }

        //            RWLock.ExitWriteLock();
        //        }
        //    }
        //    finally
        //    {
        //        RWLock.ExitWriteLock();
        //    }
        //}

        /// <summary>
        /// Rebuilds <c>mp.Patterns</c> in place: patterns that contain no "simple" unit are kept
        /// unchanged, while patterns made up only of simple units are merged, per pattern length,
        /// into a single pattern whose units match the union of the tokens seen at each position.
        /// </summary>
        /// <param name="entityType">Entity type forwarded to the simplicity checks and applied to the merged units.</param>
        /// <param name="mp">Matching pattern whose <c>Patterns</c> list is cleared and repopulated.</param>
        /// <param name="isAdd">
        /// Selects <c>IsSimpleAddEntityType</c> / <c>WithoutEntityType</c> when <c>true</c>, and
        /// <c>IsSimpleForgetEntityType</c> / <c>WithEntityType</c> when <c>false</c>.
        /// </param>
        /// <remarks>
        /// NOTE(review): a pattern mixing simple and non-simple units appears to satisfy neither filter
        /// below and would therefore not be re-added — confirm against the two-argument
        /// <c>IsSimple*EntityType</c> overloads that this is intended.
        /// </remarks>
        public void OptimizeMatchingPattern(string entityType, MatchingPattern mp, bool isAdd)
        {
            // Work on a snapshot so the live list can be emptied and rebuilt.
            var snapshot = mp.Patterns.ToArray();
            mp.Patterns.Clear();

            // Patterns in which no unit is "simple" are carried over untouched.
            var untouched = snapshot
                .Where(pattern => !pattern.Any(pu => isAdd ? IsSimpleAddEntityType(pu, entityType) : IsSimpleForgetEntityType(pu, entityType)))
                .ToList();
            mp.Patterns.AddRange(untouched);

            // Merge the fully-simple patterns, once for the ignoreCase = false pass and once for ignoreCase = true.
            foreach (var ignoreCase in new[] { false, true })
            {
                // pattern length -> one entry per pattern, holding the tokens of each of its units
                var tokensByLength = new Dictionary<int, List<List<string[]>>>();

                var mergeable = snapshot.Where(pattern => pattern.All(pu => isAdd
                    ? IsSimpleAddEntityType(pu, entityType, ignoreCase)
                    : IsSimpleForgetEntityType(pu, entityType, ignoreCase)));

                foreach (var pattern in mergeable)
                {
                    var tokensPerUnit = pattern.Select(pu => GetTokens(pu).ToArray()).ToList();

                    if (tokensByLength.TryGetValue(pattern.Length, out var bucket))
                    {
                        bucket.Add(tokensPerUnit);
                    }
                    else
                    {
                        tokensByLength[pattern.Length] = new List<List<string[]>> { tokensPerUnit };
                    }
                }

                foreach (var entry in tokensByLength)
                {
                    var unitCount = entry.Key;

                    // Union of all tokens observed at each unit position across the bucket.
                    var tokenSets = new HashSet<string>[unitCount];
                    for (int pos = 0; pos < unitCount; pos++)
                    {
                        tokenSets[pos] = new HashSet<string>();
                    }

                    foreach (var tokensPerUnit in entry.Value)
                    {
                        for (int pos = 0; pos < unitCount; pos++)
                        {
                            tokenSets[pos].UnionWith(tokensPerUnit[pos]);
                        }
                    }

                    // One merged unit per position.
                    var merged = new PatternUnit[unitCount];
                    for (int pos = 0; pos < unitCount; pos++)
                    {
                        IPatternUnit prototype = PatternUnitPrototype.Single().WithTokens(tokenSets[pos], ignoreCase);

                        // The return value is discarded, so this presumably configures the prototype in place.
                        if (isAdd)
                        {
                            prototype.WithoutEntityType(entityType);
                        }
                        else
                        {
                            prototype.WithEntityType(entityType);
                        }

                        merged[pos] = new PatternUnit(prototype);
                    }

                    mp.Patterns.Add(merged);
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Creates a capturer for <paramref name="language"/>, preparing the pattern unit built from
        /// that language's common words and the set of case-insensitive stop-word hashes.
        /// </summary>
        /// <param name="language">Language used to look up the common words and the stop-word list.</param>
        public AbbreviationCapturer(Language language)
        {
            Language = language;

            // Case-insensitive pattern unit built from the language's common-word list.
            var discardTokens    = AbbreviationCapturerCommonWords.Get(language);
            var discardPrototype = PatternUnitPrototype.Single().WithTokens(discardTokens, ignoreCase: true);
            DiscardCommonWords   = new PatternUnit(discardPrototype);

            // Stop words are stored as 64-bit case-insensitive hashes rather than strings.
            var stopwordHashes = StopWords.Spacy.For(Language)
                                          .Select(word => word.AsSpan().IgnoreCaseHash64())
                                          .ToArray();
            Stopwords = new HashSet<ulong>(stopwordHashes);
        }
Пример #3
0
 /// <summary>
 /// Checks whether the hash of <paramref name="token"/> is contained in the hashed form of <c>Set</c>,
 /// building and caching that hash set in <c>SetHashes</c> on first use.
 /// </summary>
 /// <param name="token">Token whose hash (see <c>GetTokenHash</c>) is looked up.</param>
 /// <returns><c>true</c> when the token's hash is present in the set; otherwise <c>false</c>.</returns>
 private bool MatchSet(ref Token token)
 {
     var hashes = SetHashes;
     if (hashes is null)
     {
         // No lock is taken here: on a collision one set would simply be replaced by another equal set.
         var caseSensitive = CaseSensitive;
         hashes = new HashSet<ulong>(Set.Select(entry => caseSensitive
             ? PatternUnitPrototype.Hash64(entry.AsSpan())
             : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan())));
         SetHashes = hashes;
     }

     var tokenHash = GetTokenHash(ref token);
     return hashes.Contains(tokenHash);
 }
Пример #4
0
 /// <summary>
 /// Hashes the token's value span with <c>Hash64</c> when <c>CaseSensitive</c> is set,
 /// and with <c>IgnoreCaseHash64</c> otherwise.
 /// </summary>
 /// <param name="token">Token whose <c>ValueAsSpan</c> is hashed.</param>
 /// <returns>The 64-bit hash of the token value.</returns>
 private ulong GetTokenHash(ref Token token)
 {
     if (CaseSensitive)
     {
         return PatternUnitPrototype.Hash64(token.ValueAsSpan);
     }

     return PatternUnitPrototype.IgnoreCaseHash64(token.ValueAsSpan);
 }
Пример #5
0
        /// <summary>
        /// Creates a pattern unit from explicit field values and precomputes the split
        /// suffix / prefix / entity-type / shape caches.
        /// </summary>
        /// <param name="shape">Raw shape text; when non-null it is stored as <c>shape.AsSpan().Shape(false)</c>.</param>
        /// <param name="set">Token set; used to derive <c>SetHashes</c> when <paramref name="setHashes"/> is null.</param>
        /// <param name="setHashes">Precomputed hashes of <paramref name="set"/>, or null to have them computed here.</param>
        /// <remarks>The remaining parameters are copied to the same-named properties as-is.</remarks>
        public PatternUnit(PatternMatchingMode mode, bool optional, bool caseSensitive, PatternUnitType type, PartOfSpeech[] pos, string suffix, string prefix, string shape, string token, HashSet<string> set, string entityType, HashSet<ulong> setHashes, ulong tokenHash, PatternUnit leftSide, PatternUnit rightSide)
        {
            Mode          = mode;
            Optional      = optional;
            CaseSensitive = caseSensitive;
            Type          = type;
            POS           = pos;
            Suffix        = suffix;
            Prefix        = prefix;
            Shape         = shape is null ? null : shape.AsSpan().Shape(false);
            Token         = token;
            Set           = set;
            EntityType    = entityType;

            // Prefer the supplied hashes; otherwise hash the set with the configured case sensitivity.
            if (setHashes is object)
            {
                SetHashes = setHashes;
            }
            else if (set is null)
            {
                SetHashes = null;
            }
            else
            {
                SetHashes = new HashSet<ulong>(set.Select(entry => CaseSensitive
                    ? PatternUnitPrototype.Hash64(entry.AsSpan())
                    : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan())));
            }

            TokenHash = tokenHash;
            LeftSide  = leftSide;
            RightSide = rightSide;

            // Split caches stay null when their source string is null.
            _splitSuffix = Suffix is null
                ? null
                : Suffix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
            _splitPrefix = Prefix is null
                ? null
                : Prefix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
            _splitEntityType = EntityType is null
                ? null
                : new HashSet<string>(EntityType.Split(splitChar, StringSplitOptions.RemoveEmptyEntries));
            _splitShape = Shape is null
                ? null
                : new HashSet<string>(Shape.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries));
        }
Пример #6
0
        /// <summary>
        /// Creates a pattern unit by copying a prototype, recursively converting its left/right
        /// sides, and precomputing the split suffix / prefix / entity-type / shape caches.
        /// </summary>
        /// <param name="prototype">Prototype to copy; it is cast to <c>PatternUnitPrototype</c>.</param>
        public PatternUnit(IPatternUnit prototype)
        {
            var source = (PatternUnitPrototype)prototype;

            Mode          = source.Mode;
            Optional      = source.Optional;
            CaseSensitive = source.CaseSensitive;
            Type          = source.Type;
            POS           = source.POS;
            Suffix        = source.Suffix;
            Prefix        = source.Prefix;
            Shape         = source.Shape;
            Token         = source.Token;
            Set           = source.Set;
            EntityType    = source.EntityType;

            // Reuse the prototype's hashes when present; otherwise hash its set with its case sensitivity.
            var existingHashes = source.SetHashes;
            if (existingHashes is object)
            {
                SetHashes = existingHashes;
            }
            else if (source.Set is null)
            {
                SetHashes = null;
            }
            else
            {
                SetHashes = new HashSet<ulong>(source.Set.Select(entry => source.CaseSensitive
                    ? PatternUnitPrototype.Hash64(entry.AsSpan())
                    : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan())));
            }

            TokenHash = source.TokenHash;

            // Nested prototypes are converted through this same constructor.
            var left  = source.LeftSide;
            var right = source.RightSide;
            LeftSide  = left is null ? null : new PatternUnit(left);
            RightSide = right is null ? null : new PatternUnit(right);

            ValidChars = source.ValidChars;
            MinLength  = source.MinLength;
            MaxLength  = source.MaxLength;

            // Split caches stay null when their source string is null.
            _splitSuffix = Suffix is null
                ? null
                : Suffix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
            _splitPrefix = Prefix is null
                ? null
                : Prefix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
            _splitEntityType = EntityType is null
                ? null
                : new HashSet<string>(EntityType.Split(splitChar, StringSplitOptions.RemoveEmptyEntries));
            _splitShape = Shape is null
                ? null
                : new HashSet<string>(Shape.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries));
        }
Пример #7
0
 /// <summary>
 /// Creates a pattern unit from explicit field values.
 /// </summary>
 /// <param name="shape">Raw shape text; when non-null it is stored as <c>shape.AsSpan().Shape(false)</c>.</param>
 /// <param name="set">Token set; used to derive <c>SetHashes</c> when <paramref name="setHashes"/> is null.</param>
 /// <param name="setHashes">Precomputed hashes of <paramref name="set"/>, or null to have them computed here.</param>
 /// <remarks>The remaining parameters are copied to the same-named properties as-is.</remarks>
 public PatternUnit(PatternMatchingMode mode, bool optional, bool caseSensitive, PatternUnitType type, PartOfSpeech[] pos, string suffix, string prefix, string shape, string token, HashSet<string> set, string entityType, HashSet<ulong> setHashes, ulong tokenHash, PatternUnit leftSide, PatternUnit rightSide)
 {
     Mode          = mode;
     Optional      = optional;
     CaseSensitive = caseSensitive;
     Type          = type;
     POS           = pos;
     Suffix        = suffix;
     Prefix        = prefix;
     Shape         = shape is null ? null : shape.AsSpan().Shape(false);
     Token         = token;
     Set           = set;
     EntityType    = entityType;

     // Prefer the supplied hashes; otherwise hash the set with the configured case sensitivity.
     if (setHashes is object)
     {
         SetHashes = setHashes;
     }
     else if (set is null)
     {
         SetHashes = null;
     }
     else
     {
         SetHashes = new HashSet<ulong>(set.Select(entry => CaseSensitive
             ? PatternUnitPrototype.Hash64(entry.AsSpan())
             : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan())));
     }

     TokenHash = tokenHash;
     LeftSide  = leftSide;
     RightSide = rightSide;
 }
Пример #8
0
        /// <summary>
        /// Creates a pattern unit by copying a prototype and recursively converting its
        /// left/right sides.
        /// </summary>
        /// <param name="prototype">Prototype to copy; it is cast to <c>PatternUnitPrototype</c>.</param>
        public PatternUnit(IPatternUnit prototype)
        {
            var source = (PatternUnitPrototype)prototype;

            Mode          = source.Mode;
            Optional      = source.Optional;
            CaseSensitive = source.CaseSensitive;
            Type          = source.Type;
            POS           = source.POS;
            Suffix        = source.Suffix;
            Prefix        = source.Prefix;
            Shape         = source.Shape;
            Token         = source.Token;
            Set           = source.Set;
            EntityType    = source.EntityType;

            // Reuse the prototype's hashes when present; otherwise hash its set with its case sensitivity.
            var existingHashes = source.SetHashes;
            if (existingHashes is object)
            {
                SetHashes = existingHashes;
            }
            else if (source.Set is null)
            {
                SetHashes = null;
            }
            else
            {
                SetHashes = new HashSet<ulong>(source.Set.Select(entry => source.CaseSensitive
                    ? PatternUnitPrototype.Hash64(entry.AsSpan())
                    : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan())));
            }

            TokenHash = source.TokenHash;

            // Nested prototypes are converted through this same constructor.
            var left  = source.LeftSide;
            var right = source.RightSide;
            LeftSide  = left is null ? null : new PatternUnit(left);
            RightSide = right is null ? null : new PatternUnit(right);

            ValidChars = source.ValidChars;
            MinLength  = source.MinLength;
            MaxLength  = source.MaxLength;
        }