Пример #1
0
        /// <summary>
        /// Registers <paramref name="index"/> under <paramref name="keyId"/> inside the collection
        /// identified by <paramref name="collectionId"/>, creating the per-collection list on first use.
        /// If the key is already registered, the existing entry is kept and the call is a no-op.
        /// </summary>
        /// <param name="collectionId">Key of the collection in <c>_ix</c>.</param>
        /// <param name="keyId">Key of the index inside that collection.</param>
        /// <param name="index">Node to register.</param>
        public void Add(ulong collectionId, long keyId, VectorNode index)
        {
            // Every lookup and insert happens inside one critical section. The lists written
            // here are plain (non-concurrent) collections, so probing them outside the lock
            // while another thread inserts under the lock is not safe.
            lock (_sync)
            {
                SortedList<long, VectorNode> ix;

                if (!_ix.TryGetValue(collectionId, out ix))
                {
                    ix = new SortedList<long, VectorNode>();
                    _ix.Add(collectionId, ix);
                }

                if (!ix.ContainsKey(keyId))
                {
                    ix.Add(keyId, index);
                }
            }
        }
Пример #2
0
 /// <summary>
 /// Creates a build job by storing each argument in the member of the same name.
 /// </summary>
 /// <param name="collectionId">Stored in <c>CollectionId</c>.</param>
 /// <param name="docId">Stored in <c>DocId</c>.</param>
 /// <param name="tokens">Stored in <c>Tokens</c>.</param>
 /// <param name="index">Stored in <c>Index</c>.</param>
 public BuildJob(ulong collectionId, ulong docId, IEnumerable<string> tokens, VectorNode index)
 {
     (CollectionId, DocId, Tokens, Index) = (collectionId, docId, tokens, index);
 }
Пример #3
0
        /// <summary>
        /// Writes one node record to <paramref name="stream"/>: vector offset, postings offset,
        /// vector component count (as a long), weight and a terminator code, in that order.
        /// </summary>
        /// <param name="node">Node whose fields are written.</param>
        /// <param name="stream">Destination stream.</param>
        public static void SerializeNode(VectorNode node, Stream stream)
        {
            var hasLeft = node.Left != null;
            var hasRight = node.Right != null;

            // Terminator encodes which children exist:
            // 0 = left and right, 1 = left only, 2 = right only, 3 = no children.
            long terminator = hasLeft
                ? (hasRight ? 0 : 1)
                : (hasRight ? 2 : 3);

            long componentCount = node.Vector.Count;

            stream.Write(BitConverter.GetBytes(node.VectorOffset));
            stream.Write(BitConverter.GetBytes(node.PostingsOffset));
            stream.Write(BitConverter.GetBytes(componentCount));
            stream.Write(BitConverter.GetBytes(node.Weight));
            stream.Write(BitConverter.GetBytes(terminator));
        }
Пример #4
0
        /// <summary>
        /// Measures the tree by walking the chain of <c>Right</c> links starting at
        /// <paramref name="root"/> and calling <c>Depth</c> on every node of that chain.
        /// </summary>
        /// <param name="root">First node of the chain; may be null.</param>
        /// <returns>
        /// <c>depth</c>: the largest value reported by <c>Depth</c> (never less than 1 for a non-null root);
        /// <c>width</c>: the number of nodes on the <c>Right</c> chain;
        /// <c>avgDepth</c>: the sum of all reported depths divided by the node count (integer division).
        /// For a null root all three values are 0.
        /// </returns>
        public static (int depth, int width, int avgDepth) Size(VectorNode root)
        {
            // Without this guard an empty tree leaves the node count at zero and the
            // average below divides by zero.
            if (root == null)
            {
                return (0, 0, 0);
            }

            var width    = 0;
            var depth    = 1;
            var node     = root;
            var aggDepth = 0;
            var count    = 0;

            while (node != null)
            {
                var d = Depth(node);
                if (d > depth)
                {
                    depth = d;
                }

                aggDepth += d;
                count++;
                width++;

                node = node.Right;
            }

            return (depth, width, aggDepth / count);
        }
Пример #5
0
        /// <summary>
        /// Serializes the tree rooted at <paramref name="node"/> in pre-order (node, then its
        /// <c>Left</c> subtree, then its <c>Right</c> subtree). For every node the postings are
        /// written first, then the vector, then the node record, because the node record
        /// contains the offsets assigned by the first two steps.
        /// A root whose vector has no components is skipped and serialization starts at its <c>Right</c>.
        /// </summary>
        /// <returns>
        /// The position of <paramref name="indexStream"/> before writing and the number of
        /// bytes its position advanced.
        /// </returns>
        public static (long offset, long length) SerializeTree(
            VectorNode node, Stream indexStream, Stream vectorStream, Stream postingsStream, IStringModel tokenizer)
        {
            var pending = new Stack<VectorNode>();
            var start   = indexStream.Position;

            var first = node.Vector.Count == 0 ? node.Right : node;

            if (first != null)
            {
                pending.Push(first);
            }

            while (pending.Count > 0)
            {
                var current = pending.Pop();

                SerializePostings(current, postingsStream);
                current.VectorOffset = tokenizer.SerializeVector(current.Vector, vectorStream);
                SerializeNode(current, indexStream);

                // Right goes on the stack first so that Left is popped (and written) before it.
                if (current.Right != null)
                {
                    pending.Push(current.Right);
                }

                if (current.Left != null)
                {
                    pending.Push(current.Left);
                }
            }

            return (start, indexStream.Position - start);
        }
Пример #6
0
        /// <summary>
        /// Renders the tree rooted at <paramref name="root"/> as text by delegating to the
        /// recursive overload, starting at depth 0.
        /// </summary>
        /// <param name="root">Root of the tree to render.</param>
        /// <returns>The accumulated text.</returns>
        public static string Visualize(VectorNode root)
        {
            var buffer = new StringBuilder();
            Visualize(root, buffer, 0);

            return buffer.ToString();
        }
Пример #7
0
        /// <summary>
        /// Writes the postings of <paramref name="column"/> (when a postings writer is present),
        /// then serializes the tree to the index stream and records the resulting page
        /// (offset and length) in the page index. Finally logs timing and tree size.
        /// </summary>
        /// <param name="column">Root node of the column to serialize.</param>
        public async Task SerializeColumnSegment(VectorNode column)
        {
            var stopwatch = Stopwatch.StartNew();

            if (_postingsWriter != null)
            {
                await _postingsWriter.Write(column);
            }

            // Tree serialization and the page-index entry are written under the same lock.
            lock (_indexFileSync)
            {
                var segment = column.SerializeTree(_ixStream);
                _ixStream.Flush();

                _pageIndexWriter.Write(segment.offset, segment.length);
                _pageIndexWriter.Flush();
            }

            var dimensions = column.Size();

            this.Log(
                "serialized column {0} in {1}. weight {2} depth {3} width {4} (avg depth {5})",
                _keyId,
                stopwatch.Elapsed,
                column.Weight,
                dimensions.depth,
                dimensions.width,
                dimensions.avgDepth);
        }
Пример #8
0
        /// <summary>
        /// Lazily enumerates every node reachable from <paramref name="root"/> in pre-order:
        /// a node first, then its <c>Left</c> subtree, then its <c>Right</c> subtree.
        /// A null root yields nothing.
        /// </summary>
        /// <param name="root">Node to start from.</param>
        public static IEnumerable<VectorNode> All(VectorNode root)
        {
            var pending = new Stack<VectorNode>();

            if (root != null)
            {
                pending.Push(root);
            }

            while (pending.Count > 0)
            {
                var current = pending.Pop();

                yield return current;

                // Children are read only after the consumer resumes the iterator.
                // Right is pushed first so that Left is visited before it.
                if (current.Right != null)
                {
                    pending.Push(current.Right);
                }

                if (current.Left != null)
                {
                    pending.Push(current.Left);
                }
            }
        }
Пример #9
0
        /// <summary>
        /// Walks from <paramref name="root"/> and returns the first node whose cosine angle to
        /// <paramref name="vector"/> is below <paramref name="foldAngle"/>. At each step the walk
        /// follows <c>Right</c> when it exists and <c>Left</c> otherwise.
        /// </summary>
        /// <returns>
        /// A hit carrying the node and its angle, or a default-constructed <c>Hit</c> when the
        /// walk runs off the tree without finding such a node.
        /// </returns>
        public static Hit FindFirstNonSimilar(VectorNode root, SortedList<long, int> vector, float foldAngle)
        {
            for (var cursor = root; cursor != null; cursor = cursor.Right ?? cursor.Left)
            {
                var angle = vector.CosAngle(cursor.Vector);

                if (angle < foldAngle)
                {
                    return new Hit { Score = angle, Node = cursor };
                }
            }

            return new Hit();
        }
Пример #10
0
        /// <summary>
        /// Loads every <c>*.ix</c> file in <paramref name="dir"/> into a <c>VectorTree</c>.
        /// The file name (without extension) is split on '.'; the first part is parsed as the
        /// collection hash and the second as the key id. Vectors are read from
        /// <c>{collection hash}.vec</c> in the same directory.
        /// </summary>
        /// <param name="dir">Directory to scan.</param>
        private static VectorTree DeserializeTree(string dir)
        {
            var collections = new SortedList<ulong, SortedList<uint, VectorNode>>();

            foreach (var ixFileName in Directory.GetFiles(dir, "*.ix"))
            {
                var parts = Path.GetFileNameWithoutExtension(ixFileName)
                    .Split(".", StringSplitOptions.RemoveEmptyEntries);
                var colHash = ulong.Parse(parts[0]);
                var keyId = uint.Parse(parts[1]);

                SortedList<uint, VectorNode> columns;

                if (!collections.TryGetValue(colHash, out columns))
                {
                    columns = new SortedList<uint, VectorNode>();
                    collections.Add(colHash, columns);
                }

                var vecFileName = Path.Combine(dir, string.Format("{0}.vec", colHash));

                using (var treeStream = File.OpenRead(ixFileName))
                using (var vecStream = File.OpenRead(vecFileName))
                {
                    // columns is the list stored under colHash, so this adds to the same list.
                    columns.Add(keyId, VectorNode.Deserialize(treeStream, vecStream));
                }
            }

            return new VectorTree(collections);
        }
Пример #11
0
 /// <summary>
 /// Opens the index file and the vector file for reading (allowing other readers and
 /// writers) and deserializes the tree they describe.
 /// </summary>
 /// <param name="ixFileName">Path of the index file.</param>
 /// <param name="vecFileName">Path of the vector file.</param>
 /// <returns>The node returned by <c>VectorNode.Deserialize</c>.</returns>
 public VectorNode DeserializeIndex(string ixFileName, string vecFileName)
 {
     using (var indexFile = new FileStream(ixFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
     using (var vectorFile = new FileStream(vecFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
     {
         var root = VectorNode.Deserialize(indexFile, vectorFile);

         return root;
     }
 }
Пример #12
0
 /// <summary>
 /// Opens the index file and the vector file for asynchronous reading (4096-byte buffer,
 /// shared with other readers and writers) and awaits deserialization of the tree.
 /// </summary>
 /// <param name="ixFileName">Path of the index file.</param>
 /// <param name="vecFileName">Path of the vector file.</param>
 /// <returns>The node produced by <c>VectorNode.Deserialize</c>.</returns>
 private async Task<VectorNode> DeserializeIndex(string ixFileName, string vecFileName)
 {
     using (var indexFile = new FileStream(
                ixFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite, bufferSize: 4096, useAsync: true))
     using (var vectorFile = new FileStream(
                vecFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite, bufferSize: 4096, useAsync: true))
     {
         // Awaited inside the using blocks so the streams stay open until deserialization completes.
         var root = await VectorNode.Deserialize(indexFile, vectorFile);

         return root;
     }
 }
Пример #13
0
        /// <summary>
        /// Writes the document ids of <paramref name="node"/> (with a header built from their
        /// count) to <paramref name="postingsStream"/> and then records, in
        /// <c>node.PostingsOffset</c>, the stream position at which the write started.
        /// </summary>
        public static void SerializePostings(VectorNode node, Stream postingsStream)
        {
            var startPosition = postingsStream.Position;
            var payload = node.DocIds.ToStreamWithHeader(node.DocIds.Count);

            postingsStream.Write(payload);

            // Assigned only after the write has succeeded.
            node.PostingsOffset = startPosition;
        }
Пример #14
0
 /// <summary>
 /// Serializes <paramref name="column"/> through a <c>ColumnSerializer</c> created for this
 /// collection and <paramref name="keyId"/>, using the "ixo" index and "ixop" page file
 /// extensions. The serializer is disposed once the segment has been written.
 /// </summary>
 private async Task SerializeColumn(long keyId, VectorNode column)
 {
     using (var serializer = new ColumnSerializer(
                CollectionId,
                keyId,
                SessionFactory,
                ixFileExtension: "ixo",
                pageFileExtension: "ixop"))
     {
         await serializer.SerializeColumnSegment(column);
     }
 }
Пример #15
0
        /// <summary>
        /// Counts the nodes on the chain of <c>Left</c> links that starts at
        /// <paramref name="node"/>, the start node included.
        /// </summary>
        /// <param name="node">First node of the chain; may be null.</param>
        /// <returns>The chain length; 0 when <paramref name="node"/> is null.</returns>
        public static int Depth(VectorNode node)
        {
            var length = 0;

            for (var cursor = node; cursor != null; cursor = cursor.Left)
            {
                length++;
            }

            return length;
        }
Пример #16
0
        /// <summary>
        /// Folds the vectors of every node yielded by <c>All(root)</c> into a single vector by
        /// repeatedly applying <c>VectorOperations.Merge</c>, starting from an empty list.
        /// </summary>
        /// <param name="root">Root of the tree to fold.</param>
        /// <returns>The merged vector.</returns>
        public static SortedList<long, int> Compress(VectorNode root)
        {
            var merged = new SortedList<long, int>();

            foreach (var current in All(root))
            {
                merged = VectorOperations.Merge(merged, current.Vector);
            }

            return merged;
        }
Пример #17
0
        /// <summary>
        /// Sums the vectors of every node yielded by <c>PathFinder.All(root)</c>, starting from
        /// a vector built over an empty int array.
        /// </summary>
        /// <param name="root">Root of the tree to fold.</param>
        /// <returns>The accumulated vector.</returns>
        public static Vector Compress(VectorNode root)
        {
            var sum = new Vector(new int[0]);

            foreach (var current in PathFinder.All(root))
            {
                sum = sum.Add(current.Vector);
            }

            return sum;
        }
Пример #18
0
        /// <summary>
        /// Adds every document id of <paramref name="node"/> to the document ids of
        /// <paramref name="target"/>. Does nothing when either side has no document id collection.
        /// </summary>
        public static void MergeDocIds(VectorNode target, VectorNode node)
        {
            if (target.DocIds != null && node.DocIds != null)
            {
                foreach (var id in node.DocIds)
                {
                    target.DocIds.Add(id);
                }
            }
        }
Пример #19
0
        /// <summary>
        /// Looks up the index registered for <paramref name="keyId"/> in the collection
        /// identified by <paramref name="collectionId"/>.
        /// </summary>
        /// <param name="collectionId">Collection to search.</param>
        /// <param name="keyId">Key of the wanted index.</param>
        /// <param name="index">The index when found; null when the collection is unknown.</param>
        /// <returns>True when the collection exists and contains the key.</returns>
        public bool TryGetIndex(ulong collectionId, long keyId, out VectorNode index)
        {
            var columns = _index.GetIndex(collectionId);

            if (columns == null)
            {
                index = null;
                return false;
            }

            return columns.TryGetValue(keyId, out index);
        }
Пример #20
0
        /// <summary>
        /// Adds one <c>VectorNode</c> per token to <paramref name="index"/>, passing each add the
        /// append stream opened on <c>{collection hash}.{keyId}.vec</c> in the session directory.
        /// The stream is disposed when all tokens have been added.
        /// </summary>
        /// <param name="docId">Document id given to every created node.</param>
        /// <param name="keyId">Key id used in the vector file name.</param>
        /// <param name="index">Tree the nodes are added to.</param>
        /// <param name="tokens">Tokens to index.</param>
        private void BuildInMemoryIndex(ulong docId, long keyId, VectorNode index, IEnumerable<string> tokens)
        {
            // The previous token counter was incremented but never read, so it has been dropped.
            using (var vectorStream = SessionFactory.CreateAppendStream(
                       Path.Combine(SessionFactory.Dir, string.Format("{0}.{1}.vec", CollectionId.ToHash(), keyId))))
            {
                foreach (var token in tokens)
                {
                    index.Add(new VectorNode(token, docId), vectorStream);
                }
            }
        }
Пример #21
0
        /// <summary>
        /// Builds a node from already-decoded record fields. The vector is read through
        /// <paramref name="tokenizer"/> from <paramref name="vectorStream"/> at
        /// <paramref name="vecOffset"/>, with <paramref name="componentCount"/> narrowed to an int.
        /// </summary>
        /// <returns>The newly constructed node.</returns>
        public static VectorNode DeserializeNode(
            long vecOffset,
            long postingsOffset,
            long componentCount,
            long weight,
            long terminator,
            Stream vectorStream,
            IStringModel tokenizer)
        {
            var components = tokenizer.DeserializeVector(vecOffset, (int)componentCount, vectorStream);

            return new VectorNode(postingsOffset, vecOffset, terminator, weight, componentCount, components);
        }
Пример #22
0
        /// <summary>
        /// Appends one line for <paramref name="node"/> (indented by <paramref name="depth"/> tabs),
        /// then recurses into <c>Left</c> one level deeper and into <c>Right</c> at the same level.
        /// A null node appends nothing.
        /// </summary>
        /// <param name="node">Node to render; may be null.</param>
        /// <param name="output">Builder receiving the text.</param>
        /// <param name="depth">Number of tab characters to indent with.</param>
        private static void Visualize(VectorNode node, StringBuilder output, int depth)
        {
            if (node == null)
            {
                return;
            }

            output.Append('\t', depth);

            // Append, not AppendFormat: the text is already fully interpolated. Treating it as a
            // composite format string would throw or mangle output whenever it contains '{' or '}'.
            output.Append($"{node.AngleWhenAdded} {node} w:{node.Weight}");
            output.AppendLine();

            Visualize(node.Left, output, depth + 1);
            Visualize(node.Right, output, depth);
        }
Пример #23
0
        /// <summary>
        /// Serializes <paramref name="column"/> to the index, vector and postings streams,
        /// records the written page (offset and length) in the page index, and logs timing and
        /// tree size.
        /// </summary>
        public void CreateColumnSegment(VectorNode column, Stream vectorStream, Stream postingsStream, IStringModel model)
        {
            var stopwatch = Stopwatch.StartNew();

            var segment = GraphBuilder.SerializeTree(column, _ixStream, vectorStream, postingsStream, model);
            _ixStream.Flush();

            _ixPageIndexWriter.Write(segment.offset, segment.length);
            _ixPageIndexWriter.Flush();

            var dimensions = PathFinder.Size(column);

            this.Log(
                "serialized column {0} in {1}. weight {2} depth {3} width {4} (avg depth {5})",
                _keyId,
                stopwatch.Elapsed,
                column.Weight,
                dimensions.depth,
                dimensions.width,
                dimensions.avgDepth);
        }
Пример #24
0
        /// <summary>
        /// Writes the postings of <paramref name="column"/>, serializes the tree to the index
        /// and vector streams, records the written page (offset and length) in the page index,
        /// and logs timing and tree size.
        /// </summary>
        public async Task CreateColumnSegment(VectorNode column, Stream vectorStream)
        {
            var stopwatch = Stopwatch.StartNew();

            // Postings are written before the tree is serialized.
            await _postingsWriter.Write(column);

            var segment = VectorNodeWriter.SerializeTree(column, _ixStream, vectorStream);
            _ixStream.Flush();

            _ixPageIndexWriter.Write(segment.offset, segment.length);
            _ixPageIndexWriter.Flush();

            var dimensions = VectorNodeReader.Size(column);

            this.Log(
                "serialized column {0} in {1}. weight {2} depth {3} width {4} (avg depth {5})",
                _keyId,
                stopwatch.Elapsed,
                column.Weight,
                dimensions.depth,
                dimensions.width,
                dimensions.avgDepth);
        }
Пример #25
0
        /// <summary>
        /// Merges the postings offsets of <paramref name="node"/> into <paramref name="target"/>.
        /// If the target has no offset list yet, one is created and seeded with the target's own
        /// <c>PostingsOffset</c>. Then either the node's single <c>PostingsOffset</c> (when the
        /// node has no list) or all entries of the node's list are appended.
        /// </summary>
        public static void MergePostings(VectorNode target, VectorNode node)
        {
            if (target.PostingsOffsets == null)
            {
                target.PostingsOffsets = new List<long> {
                    target.PostingsOffset
                };
            }

            if (node.PostingsOffsets == null)
            {
                target.PostingsOffsets.Add(node.PostingsOffset);
            }
            else if (target.PostingsOffsets is List<long> list)
            {
                list.AddRange(node.PostingsOffsets);
            }
            else
            {
                // The target list is some other list implementation: append element by element
                // instead of failing on a cast. Iterating a snapshot keeps this safe even when
                // target and node share the same list instance.
                foreach (var offset in new List<long>(node.PostingsOffsets))
                {
                    target.PostingsOffsets.Add(offset);
                }
            }
        }
Пример #26
0
        /// <summary>
        /// Publishes <paramref name="index"/> for the given collection and key while holding <c>_sync</c>.
        /// <list type="bullet">
        /// <item>Unknown collection: the index is handed to <c>_index.Add</c>.</item>
        /// <item>Known collection without the key: the index is stored under the key.</item>
        /// <item>Known collection with the key: the stored node is cloned, every node yielded by
        /// <c>index.Right.All()</c> is added to the clone, and the clone is handed to <c>_index.Add</c>.</item>
        /// </list>
        /// The elapsed time is logged afterwards.
        /// </summary>
        public void Publish(ulong collectionId, long keyId, VectorNode index)
        {
            lock (_sync)
            {
                var timer = Stopwatch.StartNew();
                var colIx = GetCollectionIndex(collectionId);

                if (colIx == null)
                {
                    _index.Add(collectionId, keyId, index);
                }
                else if (!colIx.ContainsKey(keyId))
                {
                    colIx[keyId] = index;
                }
                else
                {
                    VectorNode clone = colIx[keyId].Clone();

                    if (clone != null)
                    {
                        var vectorFileName = Path.Combine(
                            Dir, string.Format("{0}.{1}.vec", collectionId, keyId));

                        using (var vectorStream = CreateAppendStream(vectorFileName))
                        {
                            foreach (var node in index.Right.All())
                            {
                                clone.Add(node, vectorStream);
                            }
                        }

                        _index.Add(collectionId, keyId, clone);
                    }
                }

                _log.Log(string.Format("published {0}.{1} in {2}", collectionId, keyId, timer.Elapsed));
            }
        }
Пример #27
0
        /// <summary>
        /// Collects, for every node under <paramref name="rootNode"/> that carries more than one
        /// postings offset, a mapping from the node's <c>PostingsOffset</c> to its offset list,
        /// and forwards the mapping to the dictionary overload of <c>Concat</c>.
        /// Nothing is forwarded when no node qualifies.
        /// </summary>
        public void Concat(VectorNode rootNode)
        {
            var offsetsByPosting = new Dictionary<long, IList<long>>();

            foreach (var current in rootNode.All())
            {
                var candidates = current.PostingsOffsets;

                if (candidates == null || candidates.Count <= 1)
                {
                    continue;
                }

                offsetsByPosting.Add(current.PostingsOffset, candidates);
            }

            if (offsetsByPosting.Count > 0)
            {
                Concat(offsetsByPosting);
            }
        }
Пример #28
0
        /// <summary>
        /// Stores <paramref name="index"/> under <paramref name="keyId"/> in the collection
        /// identified by <paramref name="collectionId"/>, creating the per-collection list when
        /// it does not exist yet. An existing entry for the key is replaced.
        /// </summary>
        /// <param name="collectionId">Key of the collection in <c>_ix</c>.</param>
        /// <param name="keyId">Key of the index inside that collection.</param>
        /// <param name="index">Node to store.</param>
        public void Add(ulong collectionId, long keyId, VectorNode index)
        {
            SortedList<long, VectorNode> collection;

            if (!_ix.TryGetValue(collectionId, out collection))
            {
                // Use the list GetOrAdd hands back rather than the one just created: if another
                // caller registered a list for this collection in the meantime, writing into our
                // private instance would silently lose the index.
                // NOTE(review): assumes _ix.GetOrAdd returns the stored value, as
                // ConcurrentDictionary does — confirm against the field's declaration.
                collection = _ix.GetOrAdd(collectionId, new SortedList<long, VectorNode>());
            }

            // The indexer setter adds a missing key and overwrites an existing one.
            collection[keyId] = index;
        }
Пример #29
0
        /// <summary>
        /// Reads <paramref name="indexStream"/> in blocks of <c>VectorNode.BlockSize</c> bytes,
        /// deserializes a node from each complete block and adds every node whose
        /// <c>VectorOffset</c> is greater than -1 to <paramref name="root"/>.
        /// A trailing incomplete block is ignored.
        /// </summary>
        /// <param name="indexStream">Stream of fixed-size node records.</param>
        /// <param name="vectorStream">Stream passed to node deserialization.</param>
        /// <param name="root">Tree the nodes are added to.</param>
        /// <param name="identicalAngle">Not used by this method.</param>
        /// <param name="foldAngle">Not used by this method.</param>
        /// <param name="model">Model passed to deserialization and to <c>GraphBuilder.Add</c>.</param>
        public static void DeserializeUnorderedFile(
            Stream indexStream,
            Stream vectorStream,
            VectorNode root,
            float identicalAngle,
            float foldAngle,
            IStringModel model)
        {
            var buf = new byte[VectorNode.BlockSize];

            // Stream.Read may return fewer bytes than requested before the end of the stream,
            // so keep reading until the block is full or the stream is exhausted. Otherwise a
            // short read would be mistaken for end of file and the remaining nodes dropped.
            int ReadBlock()
            {
                var total = 0;

                while (total < buf.Length)
                {
                    var received = indexStream.Read(buf, total, buf.Length - total);

                    if (received == 0)
                    {
                        break;
                    }

                    total += received;
                }

                return total;
            }

            while (ReadBlock() == VectorNode.BlockSize)
            {
                var node = DeserializeNode(buf, vectorStream, model);

                if (node.VectorOffset > -1)
                {
                    GraphBuilder.Add(root, node, model);
                }
            }
        }
Пример #30
0
        /// <summary>
        /// Descends from <paramref name="root"/>, comparing <paramref name="vector"/> with each
        /// visited node via <c>model.CosAngle</c>. When the angle exceeds <c>model.FoldAngle</c>
        /// the walk continues to <c>Left</c>, otherwise to <c>Right</c>. The node with the
        /// highest angle seen (strictly greater than 0) is remembered.
        /// </summary>
        /// <returns>
        /// A hit holding the best score and node; when no angle exceeded 0 the score is 0 and the
        /// node is <paramref name="root"/>.
        /// </returns>
        public static Hit ClosestMatch(VectorNode root, Vector vector, IStringModel model)
        {
            var   winner    = root;
            float highscore = 0;

            for (var cursor = root; cursor != null;)
            {
                var angle    = model.CosAngle(vector, cursor.Vector);
                var goLeft   = angle > model.FoldAngle;

                // The high-score bookkeeping is the same whichever way the walk continues.
                if (angle > highscore)
                {
                    highscore = angle;
                    winner    = cursor;
                }

                cursor = goLeft ? cursor.Left : cursor.Right;
            }

            return new Hit { Score = highscore, Node = winner };
        }