/// <summary>
/// Adds one <see cref="VectorNode"/> per embedding in <paramref name="tokens"/> to the
/// index obtained from <c>GetOrCreateIndex(keyId)</c>.
/// </summary>
/// <param name="docId">Document ID passed to every node created here.</param>
/// <param name="keyId">Key ID used to look up (or create) the target index.</param>
/// <param name="tokens">Analyzed input whose <c>Embeddings</c> are enumerated.</param>
private void BuildModel(long docId, long keyId, AnalyzedString tokens)
{
    var index = GetOrCreateIndex(keyId);

    foreach (var embedding in tokens.Embeddings)
    {
        var node = new VectorNode(embedding, docId);
        var added = VectorNodeWriter.Add(index, node, Similarity.Term);

        if (added)
        {
            continue;
        }

        // Add returned false; this method tallies that outcome in _merges.
        _merges++;
    }
}
/// <summary>
/// Persists a column: writes its postings, serializes the tree to the index stream
/// and <paramref name="vectorStream"/>, records the resulting page's offset and length
/// in the page index, and logs timing and size statistics.
/// </summary>
/// <param name="column">Root node of the column to persist.</param>
/// <param name="vectorStream">Stream handed to <c>VectorNodeWriter.SerializeTree</c> together with the index stream.</param>
/// <returns>A task that completes once the column has been written and the statistics logged.</returns>
public async Task CreateColumnSegment(VectorNode column, Stream vectorStream)
{
    var stopwatch = Stopwatch.StartNew();

    // Postings are written before the tree itself is serialized.
    await _postingsWriter.Write(column);

    var segment = VectorNodeWriter.SerializeTree(column, _ixStream, vectorStream);
    _ixStream.Flush();

    // Record where the freshly serialized page lives.
    _ixPageIndexWriter.Write(segment.offset, segment.length);
    _ixPageIndexWriter.Flush();

    var dimensions = VectorNodeReader.Size(column);

    this.Log(
        "serialized column {0} in {1}. weight {2} depth {3} width {4} (avg depth {5})",
        _keyId,
        stopwatch.Elapsed,
        column.Weight,
        dimensions.depth,
        dimensions.width,
        dimensions.avgDepth);
}
/// <summary>
/// Builds a compressed postings message for the tree rooted at <paramref name="rootNode"/>,
/// sends it, and stores each returned position in the matching node's <c>PostingsOffset</c>.
/// </summary>
/// <remarks>
/// Uncompressed message layout, in order: the number of postings lists (an <c>int</c>),
/// the lengths stream, the offsets stream, and finally the documents stream.
/// Nothing is sent when serialization yields no nodes.
/// </remarks>
/// <param name="rootNode">Root of the tree whose postings are serialized.</param>
/// <returns>A task that completes once all postings offsets have been recorded.</returns>
/// <exception cref="DataMisalignedException">
/// Thrown when the node count differs from the number of <c>int</c>-sized entries in the
/// lengths stream, or from the number of positions returned by <c>Send</c>.
/// </exception>
public async Task Write(VectorNode rootNode)
{
    var totalTime = Stopwatch.StartNew();
    IList<VectorNode> serializedNodes;
    byte[] compressedMessage;

    // Assemble the postings message.
    using (var messageStream = new MemoryStream())
    using (var lengthStream = new MemoryStream())
    using (var offsetStream = new MemoryStream())
    using (var documentStream = new MemoryStream())
    {
        // Postings list lengths go to the length stream, postings offsets to the
        // offset stream, and the document data to the document stream.
        serializedNodes = VectorNodeWriter.SerializePostings(rootNode, lengthStream, offsetStream, documentStream);

        if (serializedNodes.Count == 0)
        {
            return;
        }

        // One int-sized length entry is expected per serialized node.
        if (lengthStream.Length / sizeof(int) != serializedNodes.Count)
        {
            throw new DataMisalignedException();
        }

        // Header: payload count, i.e. the number of postings lists.
        var countHeader = BitConverter.GetBytes(serializedNodes.Count);
        await messageStream.WriteAsync(countHeader);

        // Body, part 1: the lengths.
        lengthStream.Position = 0;
        await lengthStream.CopyToAsync(messageStream);

        // Body, part 2: the offsets.
        offsetStream.Position = 0;
        await offsetStream.CopyToAsync(messageStream);

        // Body, part 3: the document IDs.
        documentStream.Position = 0;
        await documentStream.CopyToAsync(messageStream);

        var rawMessage = messageStream.ToArray();
        var compressionTime = Stopwatch.StartNew();
        var compressedBytes = QuickLZ.compress(rawMessage, 3);

        this.Log(string.Format("compressing {0} bytes to {1} took {2}", rawMessage.Length, compressedBytes.Length, compressionTime.Elapsed));

        compressedMessage = compressedBytes;
    }

    this.Log(string.Format("create postings message took {0}", totalTime.Elapsed));

    // Send the message; the response is a list of (remote) file positions,
    // one per serialized node, which are saved on the nodes below.
    var remotePositions = await Send(compressedMessage);

    if (remotePositions.Count != serializedNodes.Count)
    {
        throw new DataMisalignedException();
    }

    totalTime.Restart();

    for (var nodeIndex = 0; nodeIndex < serializedNodes.Count; nodeIndex++)
    {
        var node = serializedNodes[nodeIndex];
        node.PostingsOffset = remotePositions[nodeIndex];
    }

    this.Log(string.Format("record postings offsets took {0}", totalTime.Elapsed));
}