Exemplo n.º 1
0
        // Indexes every embedding of the analyzed tokens under the given key,
        // associating each resulting vector node with the document.
        // Add returning false presumably means the vector was merged into an
        // existing node rather than inserted — TODO confirm against
        // VectorNodeWriter.Add; we count those events in _merges.
        private void BuildModel(long docId, long keyId, AnalyzedString tokens)
        {
            var index = GetOrCreateIndex(keyId);

            foreach (var embedding in tokens.Embeddings)
            {
                var node = new VectorNode(embedding, docId);

                if (VectorNodeWriter.Add(index, node, Similarity.Term))
                {
                    continue;
                }

                _merges++;
            }
        }
Exemplo n.º 2
0
        // Persists one column segment: writes the postings for the column,
        // serializes the vector tree into the index/vector streams, and then
        // records the resulting page (offset, length) in the page index.
        // Flushes happen before the page-index entry is written so the index
        // data is on the stream before its location is recorded.
        public async Task CreateColumnSegment(VectorNode column, Stream vectorStream)
        {
            var stopwatch = Stopwatch.StartNew();

            await _postingsWriter.Write(column);

            var (offset, length) = VectorNodeWriter.SerializeTree(column, _ixStream, vectorStream);

            _ixStream.Flush();

            _ixPageIndexWriter.Write(offset, length);
            _ixPageIndexWriter.Flush();

            var treeSize = VectorNodeReader.Size(column);

            this.Log("serialized column {0} in {1}. weight {2} depth {3} width {4} (avg depth {5})",
                     _keyId, stopwatch.Elapsed, column.Weight, treeSize.depth, treeSize.width, treeSize.avgDepth);
        }
Exemplo n.º 3
0
        // Serializes the postings of rootNode into one compressed message,
        // sends it (via Send) to a remote endpoint, and records the returned
        // file positions back onto the serialized nodes as PostingsOffset.
        //
        // Message layout before compression (order matters):
        //   [int32: postings-list count][lengths][offsets][document IDs]
        //
        // Throws DataMisalignedException when the lengths stream does not
        // contain exactly one int per node, or when the remote returns a
        // different number of positions than nodes were sent.
        public async Task Write(VectorNode rootNode)
        {
            var timer = Stopwatch.StartNew();

            IList <VectorNode> nodes;

            byte[] payload;

            // create postings message

            using (var message = new MemoryStream())
                using (var lengths = new MemoryStream())
                    using (var offsets = new MemoryStream())
                        using (var documents = new MemoryStream())
                        {
                            // Write length of word (i.e. length of list of postings) to header stream,
                            // postings offsets to offset stream,
                            // and word itself to documents stream.
                            nodes = VectorNodeWriter.SerializePostings(rootNode, lengths, offsets, documents);

                            // Nothing to post — skip the network round-trip entirely.
                            if (nodes.Count == 0)
                            {
                                return;
                            }

                            // Sanity check: SerializePostings must have emitted
                            // exactly one int32 length per node.
                            if (nodes.Count != lengths.Length / sizeof(int))
                            {
                                throw new DataMisalignedException();
                            }

                            // first word of message is payload count (i.e. num of postings lists)
                            await message.WriteAsync(BitConverter.GetBytes(nodes.Count));

                            // next are lengths
                            // (rewind each section stream before copying — its
                            // Position is at the end after serialization)
                            lengths.Position = 0;
                            await lengths.CopyToAsync(message);

                            // then all of the offsets
                            offsets.Position = 0;
                            await offsets.CopyToAsync(message);

                            // last are the document IDs
                            documents.Position = 0;
                            await documents.CopyToAsync(message);

                            var buf        = message.ToArray();
                            var ctime      = Stopwatch.StartNew();
                            // Level 3 is QuickLZ's strongest compression setting
                            // (more CPU, smaller payload) — see QuickLZ docs.
                            var compressed = QuickLZ.compress(buf, 3);

                            this.Log(string.Format("compressing {0} bytes to {1} took {2}", buf.Length, compressed.Length, ctime.Elapsed));

                            // Capture the compressed bytes so the temporary
                            // streams can be disposed before the send.
                            payload = compressed;
                        }

            this.Log(string.Format("create postings message took {0}", timer.Elapsed));

            // send message, receive list of (remote) file positions, save positions in index.

            var positions = await Send(payload);

            // The remote must answer with exactly one position per node sent.
            if (nodes.Count != positions.Count)
            {
                throw new DataMisalignedException();
            }

            timer.Restart();

            // Record where each postings list landed remotely so the index
            // can reference it later.
            for (int i = 0; i < nodes.Count; i++)
            {
                nodes[i].PostingsOffset = positions[i];
            }

            this.Log(string.Format("record postings offsets took {0}", timer.Elapsed));
        }