Exemplo n.º 1
0
        /// <summary>
        /// Splits <paramref name="source"/> into runs of letters and digits and
        /// builds one <c>IndexedVector</c> per run.
        /// </summary>
        /// <remarks>
        /// Each character is lower-cased with the invariant culture, so the
        /// resulting embeddings do not vary with the thread's current culture.
        /// Characters outside the range
        /// [UnicodeStartingPoint, UnicodeStartingPoint + VectorWidth] are skipped
        /// entirely: they are not added to the embedding and they do not end the
        /// current token, so they can still appear inside the slice handed to the
        /// vector.
        /// </remarks>
        /// <param name="source">The characters to tokenize.</param>
        /// <returns>
        /// The vectors in the order their tokens occur in <paramref name="source"/>;
        /// an empty list when no token is found.
        /// </returns>
        public IEnumerable <IVector> Tokenize(Memory <char> source)
        {
            var tokens = new List<IVector>();

            if (source.Length == 0)
            {
                return tokens;
            }

            var embedding  = new SortedList<int, float>();
            var span       = source.Span;
            var tokenStart = 0;

            for (var index = 0; index < span.Length; index++)
            {
                // Invariant lowercasing: char.ToLower(char) would consult the
                // current culture and produce culture-dependent embeddings.
                char c = char.ToLowerInvariant(span[index]);

                // Unsupported characters are ignored; they do not act as delimiters.
                if (c < UnicodeStartingPoint || c > UnicodeStartingPoint + VectorWidth)
                {
                    continue;
                }

                if (char.IsLetterOrDigit(c))
                {
                    embedding.AddOrAppendToComponent(c);
                    continue;
                }

                // A supported non-alphanumeric character closes the pending token.
                if (embedding.Count > 0)
                {
                    var vector = new IndexedVector(
                        embedding,
                        VectorWidth,
                        source.Slice(tokenStart, index - tokenStart));

                    // NOTE(review): the same SortedList instance is reused after
                    // Clear(); this assumes IndexedVector copies the components it
                    // is given - confirm.
                    embedding.Clear();
                    tokens.Add(vector);
                }

                tokenStart = index + 1;
            }

            // Flush the token that runs up to the end of the input, if any.
            if (embedding.Count > 0)
            {
                tokens.Add(new IndexedVector(
                    embedding,
                    VectorWidth,
                    source.Slice(tokenStart, source.Length - tokenStart)));
            }

            return tokens;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Lazily splits <paramref name="data"/> into runs of letters and digits
        /// and yields one <c>IndexedVector</c> per run.
        /// </summary>
        /// <remarks>
        /// This is an iterator: no work is done until the result is enumerated.
        /// Each character is lower-cased with the invariant culture, so the
        /// resulting embeddings do not vary with the thread's current culture.
        /// Any character that is not a letter or digit ends the current token.
        /// </remarks>
        /// <param name="data">The text to tokenize; null or empty yields nothing.</param>
        /// <returns>
        /// The vectors in the order their tokens occur in <paramref name="data"/>.
        /// </returns>
        public IEnumerable <IVector> Tokenize(string data)
        {
            if (string.IsNullOrEmpty(data))
            {
                yield break;
            }

            var embedding  = new SortedList<int, float>();
            var tokenStart = 0;

            for (var index = 0; index < data.Length; index++)
            {
                // Invariant lowercasing: char.ToLower(char) would consult the
                // current culture and produce culture-dependent embeddings.
                char c = char.ToLowerInvariant(data[index]);

                if (char.IsLetterOrDigit(c))
                {
                    embedding.AddOrAppendToComponent(c);
                    continue;
                }

                // A non-alphanumeric character closes the pending token.
                if (embedding.Count > 0)
                {
                    var vector = new IndexedVector(
                        embedding,
                        NumOfDimensions,
                        data.Substring(tokenStart, index - tokenStart));

                    // NOTE(review): the same SortedList instance is reused after
                    // Clear(); this assumes IndexedVector copies the components it
                    // is given - confirm.
                    embedding.Clear();
                    yield return vector;
                }

                tokenStart = index + 1;
            }

            // Flush the token that runs up to the end of the input, if any.
            if (embedding.Count > 0)
            {
                yield return new IndexedVector(
                    embedding,
                    NumOfDimensions,
                    data.Substring(tokenStart, data.Length - tokenStart));
            }
        }