Пример #1
0
        //private static int HashCombine(ReadOnlySpan<char> rhs, long lhs)
        //{
        //    return Hashes.HashCombine(GetHash(rhs), lhs);
        //}

        //private static int HashCombine(string rhs, long lhs)
        //{
        //    return Hashes.HashCombine(GetHash(rhs), lhs);
        //}

        /// <summary>
        /// Combines two hash values unless the second one is the <c>_HashEmpty</c> value.
        /// </summary>
        /// <param name="rhs">Value passed as the first argument to <c>Hashes.CombineWeak</c>.</param>
        /// <param name="lhs">Value passed as the second argument; when it equals <c>_HashEmpty</c> nothing is combined.</param>
        /// <returns>
        /// <c>_HashEmpty</c> when <paramref name="lhs"/> equals <c>_HashEmpty</c>;
        /// otherwise the result of <c>Hashes.CombineWeak(rhs, lhs)</c>.
        /// </returns>
        private static int HashCombine(long rhs, long lhs)
        {
            if (lhs != _HashEmpty)
            {
                return Hashes.CombineWeak(rhs, lhs);
            }

            // An "empty" second operand short-circuits: the empty value is handed back unchanged.
            return _HashEmpty;
        }
Пример #2
0
 /// <summary>
 /// Creates the tagger and precomputes the hash lookup tables for part-of-speech tags:
 /// <c>TagHashes[a]</c> holds the hash of tag <c>a</c>'s name and <c>TagTagHashes[a][b]</c>
 /// holds <c>Hashes.CombineWeak</c> of tag <c>a</c>'s hash with the hash of tag <c>b</c>'s name.
 /// </summary>
 /// <param name="language">Forwarded to the base constructor.</param>
 /// <param name="version">Forwarded to the base constructor.</param>
 /// <param name="tag">Forwarded to the base constructor.</param>
 /// <remarks>
 /// The tables are sized by the number of <c>PartOfSpeech</c> values but indexed by each value's
 /// numeric representation. NOTE(review): this presumes the enum values are 0..N-1 — confirm.
 /// </remarks>
 public AveragePerceptronTagger(Language language, int version, string tag = "") : base(language, version, tag)
 {
     // Enum.GetValues reflects over the enum and allocates a new array on every call,
     // so it is evaluated once here instead of in every loop header and per-row allocation.
     var tags  = (PartOfSpeech[])Enum.GetValues(typeof(PartOfSpeech));
     int count = tags.Length;

     // Enum-to-name conversion is comparatively costly; do it once per value rather than N*N times.
     var names = new string[count];
     for (int k = 0; k < count; k++)
     {
         names[k] = tags[k].ToString();
     }

     TagHashes    = new int[count];
     TagTagHashes = new int[count][];

     for (int a = 0; a < count; a++)
     {
         int row = (int)tags[a];

         TagHashes[row]    = GetHash(names[a]);
         TagTagHashes[row] = new int[count];

         for (int b = 0; b < count; b++)
         {
             // The second tag's hash is recomputed via GetHash (as before) rather than read from
             // TagHashes, because that table is still being filled at this point.
             TagTagHashes[row][(int)tags[b]] = Hashes.CombineWeak(TagHashes[row], GetHash(names[b]));
         }
     }
 }
Пример #3
0
        /// <summary>
        /// Rebuilds the tag list and the lookup tables derived from it for the given entity types.
        /// </summary>
        /// <param name="entityTypes">
        /// Entity type names. Each one contributes four tags (Begin, Inside, End, Single),
        /// formatted as the tag's character, then <c>Separator</c>, then the entity type.
        /// </param>
        /// <remarks>
        /// Populates <c>Data.EntityTypes</c>, <c>Data.Tags</c>, <c>Data.IndexToEntityType</c>,
        /// <c>Data.IndexToEntityTag</c>, <c>Data.TagHashes</c>, <c>Data.TagTagHashes</c> and
        /// <c>MapEntityTypeToTag</c>.
        /// </remarks>
        private void InitializeEntityTypes(string[] entityTypes)
        {
            Data.EntityTypes = entityTypes;

            var tags        = new string[entityTypes.Length * 4 + 1];
            var typeByIndex = new Dictionary<int, string>();
            var tagByIndex  = new Dictionary<int, EntityTag>();

            Data.Tags              = tags;
            Data.IndexToEntityType = typeByIndex;
            Data.IndexToEntityTag  = tagByIndex;

            // TagOutside has to occupy slot 0 of the tag list, because it is the default tag
            // in the indexing (a.k.a. IndexTagOutside).
            tags[0] = TagOutside.ToString();

            var positions = new EntityTag[] { EntityTag.Begin, EntityTag.Inside, EntityTag.End, EntityTag.Single };
            int slot      = 0;

            for (int t = 0; t < entityTypes.Length; t++)
            {
                string entityType = entityTypes[t];

                for (int p = 0; p < positions.Length; p++)
                {
                    EntityTag position = positions[p];

                    slot++;
                    tags[slot] = $"{(char)position}{Separator}{entityType}";
                    typeByIndex.Add(slot, entityType);
                    tagByIndex.Add(slot, position);
                }
            }

            int tagCount  = tags.Length;
            var hashes    = new int[tagCount];
            var pairTable = new int[tagCount][];
            var tagIndex  = new Dictionary<string, int>();

            Data.TagHashes     = hashes;
            Data.TagTagHashes  = pairTable;
            MapEntityTypeToTag = tagIndex;

            for (int row = 0; row < tagCount; row++)
            {
                int rowHash = GetHash(tags[row]);
                hashes[row] = rowHash;

                // One row per tag: the row tag's hash combined with the hash of every tag.
                var pairs = new int[tagCount];
                pairTable[row] = pairs;
                for (int col = 0; col < tagCount; col++)
                {
                    pairs[col] = Hashes.CombineWeak(rowHash, GetHash(tags[col]));
                }

                tagIndex.Add(tags[row], row);
            }
        }
        /// <summary>
        /// Produces the "shape" of a token: one marker per character, where lower-case maps to
        /// <c>x</c>, upper-case to <c>X</c>, numbers to <c>9</c>, punctuation to <c>.</c> and
        /// everything else to <c>#</c>.
        /// </summary>
        /// <param name="token">The characters to describe.</param>
        /// <param name="compact">When true, a run of consecutive characters of the same class yields a single marker.</param>
        /// <returns>The shape string, taken from <c>ShapesCache</c> when an entry for the computed hash exists.</returns>
        /// <remarks>
        /// The cache is keyed only by the hash of the character-class sequence, so two sequences
        /// producing the same hash share one cache entry.
        /// </remarks>
        public static string Shape(this ReadOnlySpan <char> token, bool compact)
        {
            // Pass 1: fold the character classes into the hash used as the cache key.
            int key       = _H_Base;
            int lastClass = _H_Base;

            foreach (char ch in token)
            {
                int charClass = char.IsLower(ch)       ? _H_Lower
                              : char.IsUpper(ch)       ? _H_Upper
                              : char.IsNumber(ch)      ? _H_Digit
                              : char.IsPunctuation(ch) ? _H_Punct
                              :                          _H_Symbol;

                bool repeatsPrevious = compact && charClass == lastClass;
                if (!repeatsPrevious)
                {
                    key = Hashes.CombineWeak(key, charClass);
                }
                lastClass = charClass;
            }

            // The cache is created on first use.
            var cache = ShapesCache;
            if (cache is null)
            {
                cache       = new Dictionary<int, string>();
                ShapesCache = cache;
            }

            if (cache.TryGetValue(key, out string cached))
            {
                return cached;
            }

            // Pass 2 (cache miss only): spell the shape out and remember it under the key.
            var  builder    = new StringBuilder(token.Length);
            char lastMarker = '\0';

            foreach (char ch in token)
            {
                char marker = char.IsLower(ch)       ? 'x'
                            : char.IsUpper(ch)       ? 'X'
                            : char.IsNumber(ch)      ? '9'
                            : char.IsPunctuation(ch) ? '.'
                            :                          '#';

                bool repeatsPrevious = compact && marker == lastMarker;
                if (!repeatsPrevious)
                {
                    builder.Append(marker);
                }
                lastMarker = marker;
            }

            string result = builder.ToString();
            cache[key] = result;
            return result;
        }
Пример #5
0
        /// <summary>
        /// Produces the "shape" of a token: one marker per character, where <c>@</c> stays <c>@</c>,
        /// forward and back slashes map to <c>/</c>, characters contained in
        /// <c>CharacterClasses.HyphenCharacters</c> map to <c>-</c>, lower-case to <c>x</c>,
        /// upper-case to <c>X</c>, numbers to <c>9</c>, punctuation to <c>.</c> and everything
        /// else to <c>#</c>.
        /// </summary>
        /// <param name="token">The characters to describe; an empty span yields an empty string.</param>
        /// <param name="compact">When true, a run of consecutive characters of the same class yields a single marker.</param>
        /// <returns>The shape string, taken from <c>ShapesCache</c> when an entry for the computed hash exists.</returns>
        /// <remarks>
        /// The cache is keyed only by the hash of the character-class sequence, so two sequences
        /// producing the same hash share one cache entry.
        /// </remarks>
        public static string Shape(this ReadOnlySpan <char> token, bool compact = false)
        {
            if (token.IsEmpty)
            {
                return("");
            }

            // Pass 1: fold the character classes into the hash used as the cache key.
            int key       = _H_Base;
            int lastClass = _H_Base;

            foreach (char ch in token)
            {
                int charClass = Classify(ch, out _);

                bool repeatsPrevious = compact && charClass == lastClass;
                if (!repeatsPrevious)
                {
                    key = Hashes.CombineWeak(key, charClass);
                }

                lastClass = charClass;
            }

            if (ShapesCache.TryGetValue(key, out string cached))
            {
                return cached;
            }

            // Pass 2 (cache miss only): spell the shape out and remember it under the key.
            var  builder    = new StringBuilder(token.Length);
            char lastMarker = '\0';

            foreach (char ch in token)
            {
                Classify(ch, out char marker);

                bool repeatsPrevious = compact && marker == lastMarker;
                if (!repeatsPrevious)
                {
                    builder.Append(marker);
                }

                lastMarker = marker;
            }

            string result = builder.ToString();
            ShapesCache[key] = result;
            return result;

            // Returns the class code for one character and reports its shape marker.
            // The checks run in the same order for both passes: literal characters first,
            // then hyphens, then the general Unicode categories.
            int Classify(char ch, out char marker)
            {
                switch (ch)
                {
                    case '@':
                        marker = '@';
                        return _H_At;

                    case '/':
                    case '\\':
                        marker = '/';
                        return _H_Slash;
                }

                if (CharacterClasses.HyphenCharacters.Contains(ch))
                {
                    marker = '-';
                    return _H_Dash;
                }

                if (char.IsLower(ch))
                {
                    marker = 'x';
                    return _H_Lower;
                }

                if (char.IsUpper(ch))
                {
                    marker = 'X';
                    return _H_Upper;
                }

                if (char.IsNumber(ch))
                {
                    marker = '9';
                    return _H_Digit;
                }

                if (char.IsPunctuation(ch))
                {
                    marker = '.';
                    return _H_Punct;
                }

                marker = '#';
                return _H_Symbol;
            }
        }