Пример #1
0
        /// <summary>
        /// Creates a capturer for <paramref name="language"/>: stores the language, builds the
        /// pattern unit held in <c>DiscardCommonWords</c> from that language's common words, and
        /// fills <c>Stopwords</c> with case-insensitive 64-bit hashes of the language's stop words.
        /// </summary>
        /// <param name="language">Language used to look up the common words and the stop words.</param>
        public AbbreviationCapturer(Language language)
        {
            Language = language;
            var commonWords = AbbreviationCapturerCommonWords.Get(language);

            // Single-token unit that matches any of the common words, ignoring case.
            DiscardCommonWords = new PatternUnit(PatternUnitPrototype.Single().WithTokens(commonWords, ignoreCase: true));

            // HashSet<T> enumerates the projected sequence itself, so materializing an
            // intermediate array first would only add an allocation.
            Stopwords = new HashSet<ulong>(StopWords.Spacy.For(Language).Select(w => w.AsSpan().IgnoreCaseHash64()));
        }
Пример #2
0
        /// <summary>
        /// Tries every pattern in <c>Patterns</c> against the beginning of <paramref name="tokens"/>
        /// and reports the longest match found.
        /// </summary>
        /// <param name="tokens">Tokens to match; matching always starts at index 0.</param>
        /// <param name="consumedTokens">
        /// Number of tokens covered by the longest matching pattern, or 0 when nothing matched.
        /// </param>
        /// <returns><c>true</c> when at least one pattern consumed one or more tokens.</returns>
        public bool IsMatch(Span<Token> tokens, out int consumedTokens)
        {
            int best = -1;

            for (int p = 0; p < Patterns.Count; p++)
            {
                var sequence = Patterns[p];
                int position = 0;

                for (int u = 0; u < sequence.Length; u++)
                {
                    PatternUnit unit   = sequence[u];
                    int         cursor = position;

                    // Consume tokens greedily; a Single-mode unit stops after its first hit.
                    while (cursor < tokens.Length && unit.IsMatch(ref tokens[cursor]))
                    {
                        cursor++;
                        if (unit.Mode == PatternMatchingMode.Single)
                        {
                            break;
                        }
                    }

                    if (cursor > position)
                    {
                        // The unit matched at least one token: advance past everything it consumed.
                        position = cursor;
                    }
                    else if (!unit.Optional)
                    {
                        // A mandatory unit failed, so this pattern can never win the comparison below.
                        position = int.MinValue;
                        break;
                    }
                }

                if (position > best)
                {
                    best = position;
                }
            }

            bool matched = best > 0;
            consumedTokens = matched ? best : 0;
            return matched;
        }
Пример #3
0
 /// <summary>
 /// Lazily enumerates the texts carried by <paramref name="pu"/>: first its <c>Token</c>
 /// (when not null), then every entry of its <c>Set</c> (when not null).
 /// </summary>
 /// <param name="pu">Pattern unit whose token and set entries are listed.</param>
 /// <returns>The token followed by the set entries; empty when both are null.</returns>
 private IEnumerable<string> GetTokens(PatternUnit pu)
 {
     var single = pu.Token;
     if (!(single is null))
     {
         yield return single;
     }

     var entries = pu.Set;
     if (entries is null)
     {
         yield break;
     }

     foreach (var entry in entries)
     {
         yield return entry;
     }
 }
Пример #4
0
 /// <summary>
 /// Initializes a pattern unit from explicitly supplied values.
 /// </summary>
 /// <remarks>
 /// <paramref name="shape"/> is stored after being passed through <c>Shape(false)</c>; null stays null.
 /// When <paramref name="setHashes"/> is null and <paramref name="set"/> is not, the hashes are computed
 /// from <paramref name="set"/>, using the case-sensitive or the case-insensitive hash depending on
 /// <c>CaseSensitive</c>. Every other argument is stored unchanged.
 /// </remarks>
 public PatternUnit(PatternMatchingMode mode, bool optional, bool caseSensitive, PatternUnitType type, PartOfSpeech[] pos, string suffix, string prefix, string shape, string token, HashSet<string> set, string entityType, HashSet<ulong> setHashes, ulong tokenHash, PatternUnit leftSide, PatternUnit rightSide)
 {
     Mode = mode;
     Optional = optional;
     CaseSensitive = caseSensitive;
     Type = type;
     POS = pos;
     Suffix = suffix;
     Prefix = prefix;
     Shape = shape is null ? null : shape.AsSpan().Shape(false);
     Token = token;
     Set = set;
     EntityType = entityType;

     // Prefer the hashes handed in by the caller; otherwise derive them from the set, if any.
     HashSet<ulong> hashes = setHashes;
     if (hashes is null && !(set is null))
     {
         hashes = new HashSet<ulong>();
         foreach (var entry in set)
         {
             hashes.Add(CaseSensitive
                 ? PatternUnitPrototype.Hash64(entry.AsSpan())
                 : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan()));
         }
     }
     SetHashes = hashes;

     TokenHash = tokenHash;
     LeftSide = leftSide;
     RightSide = rightSide;
 }
Пример #5
0
        /// <summary>
        /// Initializes a pattern unit from explicitly supplied values and precomputes the split
        /// forms of its suffix, prefix, entity type and shape.
        /// </summary>
        /// <remarks>
        /// <paramref name="shape"/> is stored after being passed through <c>Shape(false)</c>; null stays null.
        /// When <paramref name="setHashes"/> is null and <paramref name="set"/> is not, the hashes are computed
        /// from <paramref name="set"/>, using the case-sensitive or the case-insensitive hash depending on
        /// <c>CaseSensitive</c>. Suffix, prefix and shape are split on <c>splitCharWithWhitespaces</c>, the
        /// entity type on <c>splitChar</c>; empty entries are dropped, and each cache is null when its
        /// source text is null.
        /// </remarks>
        public PatternUnit(PatternMatchingMode mode, bool optional, bool caseSensitive, PatternUnitType type, PartOfSpeech[] pos, string suffix, string prefix, string shape, string token, HashSet<string> set, string entityType, HashSet<ulong> setHashes, ulong tokenHash, PatternUnit leftSide, PatternUnit rightSide)
        {
            Mode = mode;
            Optional = optional;
            CaseSensitive = caseSensitive;
            Type = type;
            POS = pos;
            Suffix = suffix;
            Prefix = prefix;
            Shape = shape is null ? null : shape.AsSpan().Shape(false);
            Token = token;
            Set = set;
            EntityType = entityType;

            // Prefer the hashes handed in by the caller; otherwise derive them from the set, if any.
            HashSet<ulong> hashes = setHashes;
            if (hashes is null && !(set is null))
            {
                hashes = new HashSet<ulong>();
                foreach (var entry in set)
                {
                    hashes.Add(CaseSensitive
                        ? PatternUnitPrototype.Hash64(entry.AsSpan())
                        : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan()));
                }
            }
            SetHashes = hashes;

            TokenHash = tokenHash;
            LeftSide = leftSide;
            RightSide = rightSide;

            // Suffix and prefix alternatives are kept as de-duplicated arrays.
            _splitSuffix = Suffix is null
                ? null
                : Suffix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
            _splitPrefix = Prefix is null
                ? null
                : Prefix.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();

            // Entity types and shapes are kept as sets.
            if (EntityType is null)
            {
                _splitEntityType = null;
            }
            else
            {
                _splitEntityType = new HashSet<string>(EntityType.Split(splitChar, StringSplitOptions.RemoveEmptyEntries));
            }

            if (Shape is null)
            {
                _splitShape = null;
            }
            else
            {
                _splitShape = new HashSet<string>(Shape.Split(splitCharWithWhitespaces, StringSplitOptions.RemoveEmptyEntries));
            }
        }
Пример #6
0
        /// <summary>
        /// Builds a pattern unit by copying the values of a prototype.
        /// </summary>
        /// <remarks>
        /// The argument is cast to <c>PatternUnitPrototype</c>. When the prototype carries no
        /// <c>SetHashes</c> but has a <c>Set</c>, the hashes are computed from the set, using the
        /// case-sensitive or the case-insensitive hash depending on the prototype's
        /// <c>CaseSensitive</c>. Non-null <c>LeftSide</c> and <c>RightSide</c> prototypes are
        /// converted into pattern units by this same constructor.
        /// </remarks>
        /// <param name="prototype">Prototype to copy from.</param>
        public PatternUnit(IPatternUnit prototype)
        {
            var source = (PatternUnitPrototype)prototype;

            Mode = source.Mode;
            Optional = source.Optional;
            CaseSensitive = source.CaseSensitive;
            Type = source.Type;
            POS = source.POS;
            Suffix = source.Suffix;
            Prefix = source.Prefix;
            Shape = source.Shape;
            Token = source.Token;
            Set = source.Set;
            EntityType = source.EntityType;

            // Reuse the prototype's hashes when present; otherwise derive them from its set, if any.
            var hashes = source.SetHashes;
            if (hashes is null && !(source.Set is null))
            {
                hashes = new HashSet<ulong>();
                foreach (var entry in source.Set)
                {
                    hashes.Add(source.CaseSensitive
                        ? PatternUnitPrototype.Hash64(entry.AsSpan())
                        : PatternUnitPrototype.IgnoreCaseHash64(entry.AsSpan()));
                }
            }
            SetHashes = hashes;

            TokenHash = source.TokenHash;

            // Child prototypes are converted recursively; a missing side stays null.
            if (source.LeftSide is null)
            {
                LeftSide = null;
            }
            else
            {
                LeftSide = new PatternUnit(source.LeftSide);
            }

            if (source.RightSide is null)
            {
                RightSide = null;
            }
            else
            {
                RightSide = new PatternUnit(source.RightSide);
            }

            ValidChars = source.ValidChars;
            MinLength = source.MinLength;
            MaxLength = source.MaxLength;
        }
Пример #7
0
 /// <summary>
 /// Checks that <paramref name="pu"/> passes the two-argument
 /// <c>IsSimpleForgetEntityType</c> test and that its case sensitivity is the opposite of
 /// <paramref name="ignoreCase"/>.
 /// </summary>
 /// <param name="pu">Pattern unit to inspect.</param>
 /// <param name="entityType">Entity type the unit must carry.</param>
 /// <param name="ignoreCase">Expected case handling: the unit must be case-sensitive exactly when this is false.</param>
 /// <returns><c>true</c> when both conditions hold.</returns>
 public bool IsSimpleForgetEntityType(PatternUnit pu, string entityType, bool ignoreCase)
 {
     if (!IsSimpleForgetEntityType(pu, entityType))
     {
         return false;
     }

     return pu.CaseSensitive != ignoreCase;
 }
Пример #8
0
 /// <summary>
 /// Checks whether <paramref name="pu"/> is a single-match unit whose type is exactly
 /// <c>Token | Entity</c> or <c>Set | Entity</c> and whose entity type equals
 /// <paramref name="entityType"/>.
 /// </summary>
 /// <param name="pu">Pattern unit to inspect.</param>
 /// <param name="entityType">Entity type the unit must carry.</param>
 /// <returns><c>true</c> when all three conditions hold.</returns>
 public bool IsSimpleForgetEntityType(PatternUnit pu, string entityType)
 {
     if (pu.Mode != PatternMatchingMode.Single)
     {
         return false;
     }

     var kind = pu.Type;
     bool isTokenEntity = kind == (PatternUnitType.Token | PatternUnitType.Entity);
     bool isSetEntity   = kind == (PatternUnitType.Set | PatternUnitType.Entity);
     if (!isTokenEntity && !isSetEntity)
     {
         return false;
     }

     return pu.EntityType == entityType;
 }