//private static int HashCombine(ReadOnlySpan<char> rhs, long lhs)
//{
//    return Hashes.HashCombine(GetHash(rhs), lhs);
//}
//private static int HashCombine(string rhs, long lhs)
//{
//    return Hashes.HashCombine(GetHash(rhs), lhs);
//}

/// <summary>
/// Combines two hash values, propagating the "empty" sentinel.
/// </summary>
/// <param name="rhs">Hash value passed as the first argument to <c>Hashes.CombineWeak</c>.</param>
/// <param name="lhs">Hash value passed as the second argument; compared against <c>_HashEmpty</c>.</param>
/// <returns>
/// <c>_HashEmpty</c> when <paramref name="lhs"/> equals <c>_HashEmpty</c>;
/// otherwise the result of <c>Hashes.CombineWeak(rhs, lhs)</c>.
/// </returns>
private static int HashCombine(long rhs, long lhs)
{
    // Only lhs is checked for the sentinel; rhs is always passed through as-is.
    if (lhs != _HashEmpty)
    {
        return Hashes.CombineWeak(rhs, lhs);
    }

    return _HashEmpty;
}
/// <summary>
/// Creates the tagger and precomputes the hash lookup tables for part-of-speech tags:
/// <c>TagHashes[p]</c> holds the hash of the name of tag <c>p</c>, and
/// <c>TagTagHashes[p][q]</c> holds <c>Hashes.CombineWeak</c> of the hashes of tags <c>p</c> and <c>q</c>.
/// </summary>
/// <param name="language">Forwarded to the base constructor.</param>
/// <param name="version">Forwarded to the base constructor.</param>
/// <param name="tag">Forwarded to the base constructor.</param>
public AveragePerceptronTagger(Language language, int version, string tag = "") : base(language, version, tag)
{
    // Enum.GetValues allocates a fresh array on every call, so fetch it exactly once.
    // The typed cast also avoids boxing/unboxing each value inside the loops.
    var allTags = (PartOfSpeech[])Enum.GetValues(typeof(PartOfSpeech));
    int tagCount = allTags.Length;

    // NOTE(review): both tables are indexed by the numeric enum value, so this assumes the
    // PartOfSpeech values all lie in 0..tagCount-1 (same assumption as before) - confirm.
    TagHashes = new int[tagCount];
    TagTagHashes = new int[tagCount][];

    // Pass 1: hash each tag name once (tagCount GetHash calls instead of tagCount^2).
    foreach (var pos in allTags)
    {
        TagHashes[(int)pos] = GetHash(pos.ToString());
    }

    // Pass 2: combine the already-computed per-tag hashes into the pair table.
    // Relies on GetHash being deterministic, so TagHashes[q] equals GetHash(q.ToString()).
    foreach (var pos in allTags)
    {
        var row = new int[tagCount];
        int firstHash = TagHashes[(int)pos];

        foreach (var pos2 in allTags)
        {
            row[(int)pos2] = Hashes.CombineWeak(firstHash, TagHashes[(int)pos2]);
        }

        TagTagHashes[(int)pos] = row;
    }
}
/// <summary>
/// Builds the tag inventory and the hash lookup tables for the given entity types.
/// </summary>
/// <remarks>
/// For every entity type four tags are generated (Begin, Inside, End, Single), each formatted as
/// <c>{(char)entityTag}{Separator}{entityType}</c>. Index 0 is reserved for <c>TagOutside</c>.
/// The method fills <c>Data.Tags</c>, <c>Data.IndexToEntityType</c>, <c>Data.IndexToEntityTag</c>,
/// <c>Data.TagHashes</c>, <c>Data.TagTagHashes</c> and <c>MapEntityTypeToTag</c> (tag string to tag index).
/// </remarks>
/// <param name="entityTypes">Entity type names; stored as-is in <c>Data.EntityTypes</c>.</param>
/// <exception cref="System.ArgumentException">
/// Thrown by <c>Dictionary.Add</c> when two generated tag strings are equal
/// (for example when <paramref name="entityTypes"/> contains a duplicate).
/// </exception>
private void InitializeEntityTypes(string[] entityTypes)
{
    Data.EntityTypes = entityTypes;
    Data.Tags = new string[entityTypes.Length * 4 + 1];
    Data.IndexToEntityType = new Dictionary<int, string>();
    Data.IndexToEntityTag = new Dictionary<int, EntityTag>();

    //TagOutside must be the first in the tag list, as it's the default tag in the indexing (a.k.a IndexTagOutside)
    Data.Tags[0] = TagOutside.ToString();

    // Allocated once instead of once per entity type.
    var positions = new EntityTag[] { EntityTag.Begin, EntityTag.Inside, EntityTag.End, EntityTag.Single };

    int next = 1;
    foreach (var et in entityTypes)
    {
        foreach (var s in positions)
        {
            Data.Tags[next] = $"{(char)s}{Separator}{et}";
            Data.IndexToEntityType.Add(next, et);
            Data.IndexToEntityTag.Add(next, s);
            next++;
        }
    }

    int tagCount = Data.Tags.Length;
    Data.TagHashes = new int[tagCount];
    Data.TagTagHashes = new int[tagCount][];
    MapEntityTypeToTag = new Dictionary<string, int>();

    // Pass 1: hash every tag string once (tagCount GetHash calls instead of tagCount^2).
    for (int i = 0; i < tagCount; i++)
    {
        Data.TagHashes[i] = GetHash(Data.Tags[i]);
    }

    // Pass 2: build the pair table from the cached hashes and register each tag's index.
    // Relies on GetHash being deterministic, so TagHashes[j] equals GetHash(Tags[j]).
    for (int i = 0; i < tagCount; i++)
    {
        var row = new int[tagCount];
        int firstHash = Data.TagHashes[i];

        for (int j = 0; j < tagCount; j++)
        {
            row[j] = Hashes.CombineWeak(firstHash, Data.TagHashes[j]);
        }

        Data.TagTagHashes[i] = row;
        MapEntityTypeToTag.Add(Data.Tags[i], i);
    }
}
/// <summary>
/// Returns the "shape" of a token: one symbol per character describing its class
/// (<c>x</c> lowercase, <c>X</c> uppercase, <c>9</c> number, <c>.</c> punctuation, <c>#</c> anything else).
/// </summary>
/// <param name="token">Characters to describe.</param>
/// <param name="compact">
/// When <c>true</c>, consecutive characters of the same class contribute a single symbol.
/// </param>
/// <returns>The shape string, taken from <c>ShapesCache</c> when a matching hash is already stored.</returns>
/// <remarks>
/// The cache is keyed only by the combined hash of the class sequence, so two sequences that
/// produce the same hash share one cached shape.
/// </remarks>
public static string Shape(this ReadOnlySpan<char> token, bool compact)
{
    // Hash code for the character class of a single character.
    int ClassOf(char ch)
    {
        if (char.IsLower(ch)) { return _H_Lower; }
        if (char.IsUpper(ch)) { return _H_Upper; }
        if (char.IsNumber(ch)) { return _H_Digit; }
        if (char.IsPunctuation(ch)) { return _H_Punct; }
        return _H_Symbol;
    }

    // Output symbol for the character class of a single character (same test order as ClassOf).
    char SymbolOf(char ch)
    {
        if (char.IsLower(ch)) { return 'x'; }
        if (char.IsUpper(ch)) { return 'X'; }
        if (char.IsNumber(ch)) { return '9'; }
        if (char.IsPunctuation(ch)) { return '.'; }
        return '#';
    }

    // First pass: fold the class sequence into a cache key without allocating.
    int key = _H_Base;
    int lastClass = _H_Base;
    foreach (var ch in token)
    {
        int cls = ClassOf(ch);
        if (!compact || cls != lastClass)
        {
            key = Hashes.CombineWeak(key, cls);
        }
        lastClass = cls;
    }

    if (ShapesCache is null)
    {
        ShapesCache = new Dictionary<int, string>();
    }

    if (ShapesCache.TryGetValue(key, out var cached))
    {
        return cached;
    }

    // Cache miss: second pass materialises the shape string.
    var builder = new StringBuilder(token.Length);
    char lastSymbol = '\0';
    foreach (var ch in token)
    {
        char symbol = SymbolOf(ch);
        if (!compact || symbol != lastSymbol)
        {
            builder.Append(symbol);
        }
        lastSymbol = symbol;
    }

    var result = builder.ToString();
    ShapesCache[key] = result;
    return result;
}
/// <summary>
/// Returns the "shape" of a token: one symbol per character describing its class.
/// </summary>
/// <remarks>
/// Classes are tested in this order: <c>@</c> gives <c>@</c>; <c>/</c> or <c>\</c> gives <c>/</c>;
/// a member of <c>CharacterClasses.HyphenCharacters</c> gives <c>-</c>; lowercase gives <c>x</c>;
/// uppercase gives <c>X</c>; number gives <c>9</c>; punctuation gives <c>.</c>; anything else gives <c>#</c>.
/// Results are stored in <c>ShapesCache</c>, keyed only by the combined hash of the class sequence,
/// so two sequences that produce the same hash share one cached shape.
/// </remarks>
/// <param name="token">Characters to describe.</param>
/// <param name="compact">
/// When <c>true</c>, consecutive characters of the same class contribute a single symbol.
/// </param>
/// <returns>The shape string; the empty string for an empty token.</returns>
public static string Shape(this ReadOnlySpan<char> token, bool compact = false)
{
    // Hash code for the character class of a single character.
    int ClassOf(char ch)
    {
        if (ch == '@') { return _H_At; }
        if (ch == '/' || ch == '\\') { return _H_Slash; }
        if (CharacterClasses.HyphenCharacters.Contains(ch)) { return _H_Dash; }
        if (char.IsLower(ch)) { return _H_Lower; }
        if (char.IsUpper(ch)) { return _H_Upper; }
        if (char.IsNumber(ch)) { return _H_Digit; }
        if (char.IsPunctuation(ch)) { return _H_Punct; }
        return _H_Symbol;
    }

    // Output symbol for the character class of a single character (same test order as ClassOf).
    char SymbolOf(char ch)
    {
        if (ch == '@') { return '@'; }
        if (ch == '/' || ch == '\\') { return '/'; }
        if (CharacterClasses.HyphenCharacters.Contains(ch)) { return '-'; }
        if (char.IsLower(ch)) { return 'x'; }
        if (char.IsUpper(ch)) { return 'X'; }
        if (char.IsNumber(ch)) { return '9'; }
        if (char.IsPunctuation(ch)) { return '.'; }
        return '#';
    }

    // Empty tokens never touch the cache.
    if (token.Length == 0)
    {
        return "";
    }

    // First pass: fold the class sequence into a cache key without allocating.
    int key = _H_Base;
    int lastClass = _H_Base;
    foreach (var ch in token)
    {
        int cls = ClassOf(ch);
        if (!compact || cls != lastClass)
        {
            key = Hashes.CombineWeak(key, cls);
        }
        lastClass = cls;
    }

    if (ShapesCache.TryGetValue(key, out var cached))
    {
        return cached;
    }

    // Cache miss: second pass materialises the shape string.
    var builder = new StringBuilder(token.Length);
    char lastSymbol = '\0';
    foreach (var ch in token)
    {
        char symbol = SymbolOf(ch);
        if (!compact || symbol != lastSymbol)
        {
            builder.Append(symbol);
        }
        lastSymbol = symbol;
    }

    var result = builder.ToString();
    ShapesCache[key] = result;
    return result;
}