/// <summary>
/// Registers <paramref name="index"/> under (<paramref name="collectionId"/>, <paramref name="keyId"/>)
/// unless an entry for that pair already exists, in which case the existing entry is kept.
/// </summary>
/// <param name="collectionId">Key of the per-collection list inside <c>_ix</c>.</param>
/// <param name="keyId">Key inside the per-collection list.</param>
/// <param name="index">Node to store when the pair is not present yet.</param>
public void Add(ulong collectionId, long keyId, VectorNode index)
{
    // Both the lookup and the insert run under the same lock. The previous version
    // probed _ix and the inner SortedList before taking the lock; SortedList is not
    // safe to read while another thread is inserting into it.
    lock (_sync)
    {
        SortedList<long, VectorNode> ix;

        if (!_ix.TryGetValue(collectionId, out ix))
        {
            ix = new SortedList<long, VectorNode>();
            _ix.Add(collectionId, ix);
        }

        if (!ix.ContainsKey(keyId))
        {
            ix.Add(keyId, index);
        }
    }
}
/// <summary>
/// Initializes a build job by copying each argument into the member of the same name.
/// </summary>
/// <param name="collectionId">Assigned to <c>CollectionId</c>.</param>
/// <param name="docId">Assigned to <c>DocId</c>.</param>
/// <param name="tokens">Assigned to <c>Tokens</c>; the sequence is stored as given, not enumerated here.</param>
/// <param name="index">Assigned to <c>Index</c>.</param>
public BuildJob(ulong collectionId, ulong docId, IEnumerable<string> tokens, VectorNode index)
{
    this.CollectionId = collectionId;
    this.DocId = docId;
    this.Tokens = tokens;
    this.Index = index;
}
/// <summary>
/// Writes one node record to <paramref name="stream"/>: vector offset, postings offset,
/// vector component count (as a long), weight and a terminator code, in that order.
/// </summary>
/// <remarks>
/// Terminator codes: 0 = both children, 1 = left child only, 2 = right child only, 3 = no children.
/// </remarks>
/// <param name="node">Node whose fields are written.</param>
/// <param name="stream">Destination stream.</param>
public static void SerializeNode(VectorNode node, Stream stream)
{
    bool hasLeft = node.Left != null;
    bool hasRight = node.Right != null;

    long terminator;

    if (hasLeft && hasRight)
    {
        terminator = 0;
    }
    else if (hasLeft)
    {
        terminator = 1;
    }
    else if (hasRight)
    {
        terminator = 2;
    }
    else
    {
        terminator = 3;
    }

    stream.Write(BitConverter.GetBytes(node.VectorOffset));
    stream.Write(BitConverter.GetBytes(node.PostingsOffset));
    stream.Write(BitConverter.GetBytes((long)node.Vector.Count));
    stream.Write(BitConverter.GetBytes(node.Weight));
    stream.Write(BitConverter.GetBytes(terminator));
}
/// <summary>
/// Measures the tree by walking the chain of <c>Right</c> links starting at <paramref name="root"/>.
/// </summary>
/// <param name="root">First node of the chain; may be null.</param>
/// <returns>
/// <c>depth</c>: the largest <c>Depth</c> value seen along the chain (never less than 1);
/// <c>width</c>: the number of nodes on the chain;
/// <c>avgDepth</c>: the integer average of the per-node depths, or 0 when the chain is empty.
/// </returns>
public static (int depth, int width, int avgDepth) Size(VectorNode root)
{
    var width = 0;
    var depth = 1;
    var aggDepth = 0;

    for (var node = root; node != null; node = node.Right)
    {
        var d = Depth(node);

        if (d > depth)
        {
            depth = d;
        }

        aggDepth += d;
        width++;
    }

    // An empty chain used to divide by zero; report an average of 0 instead.
    var avgDepth = width == 0 ? 0 : aggDepth / width;

    return (depth, width, avgDepth);
}
/// <summary>
/// Serializes a tree in pre-order (node, then its Left subtree, then its Right subtree).
/// For every visited node the postings are written first, then the vector (whose returned
/// offset is stored in <c>VectorOffset</c>), then the node record itself.
/// </summary>
/// <remarks>
/// If the starting node's vector has no components, that node is skipped and the walk
/// starts at its <c>Right</c> link.
/// </remarks>
/// <param name="node">Starting node; must not be null.</param>
/// <param name="indexStream">Receives node records.</param>
/// <param name="vectorStream">Receives serialized vectors.</param>
/// <param name="postingsStream">Receives postings.</param>
/// <param name="tokenizer">Model used to serialize each vector.</param>
/// <returns>The position of <paramref name="indexStream"/> before writing, and the number of bytes its position advanced.</returns>
public static (long offset, long length) SerializeTree(
    VectorNode node,
    Stream indexStream,
    Stream vectorStream,
    Stream postingsStream,
    IStringModel tokenizer)
{
    var pending = new Stack<VectorNode>();
    var start = indexStream.Position;

    var first = node.Vector.Count == 0 ? node.Right : node;

    if (first != null)
    {
        pending.Push(first);
    }

    while (pending.Count > 0)
    {
        var current = pending.Pop();

        SerializePostings(current, postingsStream);
        current.VectorOffset = tokenizer.SerializeVector(current.Vector, vectorStream);
        SerializeNode(current, indexStream);

        // Right goes on the stack first so that Left is popped (and written) first.
        if (current.Right != null)
        {
            pending.Push(current.Right);
        }

        if (current.Left != null)
        {
            pending.Push(current.Left);
        }
    }

    return (start, indexStream.Position - start);
}
/// <summary>
/// Renders the tree under <paramref name="root"/> as text by delegating to the
/// recursive overload, starting at indentation level 0.
/// </summary>
/// <param name="root">Top node of the tree to render.</param>
/// <returns>The accumulated text.</returns>
public static string Visualize(VectorNode root)
{
    var buffer = new StringBuilder();

    Visualize(root, buffer, 0);

    return buffer.ToString();
}
/// <summary>
/// Writes one column segment: postings first (when a postings writer is configured),
/// then the tree into the index stream, then the page's offset/length into the page index.
/// Finally logs timing and size figures for the column.
/// </summary>
/// <param name="column">Root of the column tree to serialize.</param>
public async Task SerializeColumnSegment(VectorNode column)
{
    var timer = Stopwatch.StartNew();

    if (_postingsWriter != null)
    {
        await _postingsWriter.Write(column);
    }

    // The tree write and the matching page-index entry happen under one lock.
    lock (_indexFileSync)
    {
        var segment = column.SerializeTree(_ixStream);
        _ixStream.Flush();

        _pageIndexWriter.Write(segment.offset, segment.length);
        _pageIndexWriter.Flush();
    }

    var dimensions = column.Size();

    this.Log(
        "serialized column {0} in {1}. weight {2} depth {3} width {4} (avg depth {5})",
        _keyId,
        timer.Elapsed,
        column.Weight,
        dimensions.depth,
        dimensions.width,
        dimensions.avgDepth);
}
/// <summary>
/// Lazily enumerates every node reachable from <paramref name="root"/> in pre-order:
/// a node, then everything under its <c>Left</c> link, then everything under its <c>Right</c> link.
/// </summary>
/// <param name="root">Starting node; a null root yields an empty sequence.</param>
/// <returns>The nodes in visiting order.</returns>
public static IEnumerable<VectorNode> All(VectorNode root)
{
    var pending = new Stack<VectorNode>();

    if (root != null)
    {
        pending.Push(root);
    }

    while (pending.Count > 0)
    {
        var current = pending.Pop();

        yield return current;

        // Push Right before Left so that Left is the next node popped.
        if (current.Right != null)
        {
            pending.Push(current.Right);
        }

        if (current.Left != null)
        {
            pending.Push(current.Left);
        }
    }
}
/// <summary>
/// Walks down from <paramref name="root"/> and returns the first node whose cosine angle
/// to <paramref name="vector"/> is below <paramref name="foldAngle"/>.
/// </summary>
/// <remarks>
/// At each step the walk prefers the <c>Right</c> link and only follows <c>Left</c>
/// when there is no <c>Right</c>.
/// </remarks>
/// <param name="root">Node to start from; may be null.</param>
/// <param name="vector">Vector compared against each visited node.</param>
/// <param name="foldAngle">Threshold; a node qualifies when its angle is strictly smaller.</param>
/// <returns>A hit carrying the node and its angle, or a default <c>Hit</c> when none qualifies.</returns>
public static Hit FindFirstNonSimilar(VectorNode root, SortedList<long, int> vector, float foldAngle)
{
    for (var current = root;
         current != null;
         current = current.Right != null ? current.Right : current.Left)
    {
        var angle = vector.CosAngle(current.Vector);

        if (angle < foldAngle)
        {
            return new Hit { Score = angle, Node = current };
        }
    }

    return new Hit();
}
/// <summary>
/// Loads every <c>*.ix</c> file in <paramref name="dir"/> into a two-level lookup.
/// </summary>
/// <remarks>
/// The file name (without extension) is split on '.'; the first part is parsed as the
/// column hash and the second as the key id. Vectors are read from
/// <c>{column hash}.vec</c> in the same directory.
/// </remarks>
/// <param name="dir">Directory to scan.</param>
/// <returns>A <c>VectorTree</c> built from the loaded lookup.</returns>
private static VectorTree DeserializeTree(string dir)
{
    var columns = new SortedList<ulong, SortedList<uint, VectorNode>>();

    foreach (var ixFileName in Directory.GetFiles(dir, "*.ix"))
    {
        var parts = Path.GetFileNameWithoutExtension(ixFileName)
            .Split(".", StringSplitOptions.RemoveEmptyEntries);

        var colHash = ulong.Parse(parts[0]);
        var keyId = uint.Parse(parts[1]);

        SortedList<uint, VectorNode> keys;

        if (!columns.TryGetValue(colHash, out keys))
        {
            keys = new SortedList<uint, VectorNode>();
            columns.Add(colHash, keys);
        }

        using (var treeStream = File.OpenRead(ixFileName))
        using (var vecStream = File.OpenRead(Path.Combine(dir, string.Format("{0}.vec", colHash))))
        {
            keys.Add(keyId, VectorNode.Deserialize(treeStream, vecStream));
        }
    }

    return new VectorTree(columns);
}
/// <summary>
/// Opens the index file and the vector file read-only (allowing other readers and writers)
/// and deserializes a tree from them.
/// </summary>
/// <param name="ixFileName">Path of the index file.</param>
/// <param name="vecFileName">Path of the vector file.</param>
/// <returns>The node returned by <c>VectorNode.Deserialize</c>.</returns>
public VectorNode DeserializeIndex(string ixFileName, string vecFileName)
{
    using (var ixFile = new FileStream(ixFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (var vecFile = new FileStream(vecFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    {
        var root = VectorNode.Deserialize(ixFile, vecFile);

        return root;
    }
}
/// <summary>
/// Opens the index file and the vector file read-only for asynchronous I/O
/// (allowing other readers and writers) and awaits deserialization of a tree from them.
/// </summary>
/// <param name="ixFileName">Path of the index file.</param>
/// <param name="vecFileName">Path of the vector file.</param>
/// <returns>The node produced by <c>VectorNode.Deserialize</c>.</returns>
private async Task<VectorNode> DeserializeIndex(string ixFileName, string vecFileName)
{
    const int bufferSize = 4096;
    const bool useAsync = true;

    using (var ixFile = new FileStream(
        ixFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite, bufferSize, useAsync))
    using (var vecFile = new FileStream(
        vecFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite, bufferSize, useAsync))
    {
        var root = await VectorNode.Deserialize(ixFile, vecFile);

        return root;
    }
}
/// <summary>
/// Appends the node's document ids (with a header carrying their count) to
/// <paramref name="postingsStream"/> and records where they were written.
/// </summary>
/// <param name="node">Node whose <c>DocIds</c> are written; its <c>PostingsOffset</c> is updated.</param>
/// <param name="postingsStream">Destination stream.</param>
public static void SerializePostings(VectorNode node, Stream postingsStream)
{
    // Capture the position before writing: that is where this node's postings begin.
    var start = postingsStream.Position;

    var payload = node.DocIds.ToStreamWithHeader(node.DocIds.Count);
    postingsStream.Write(payload);

    node.PostingsOffset = start;
}
/// <summary>
/// Serializes one column through a <c>ColumnSerializer</c> configured with the
/// "ixo" index extension and the "ixop" page extension; the serializer is disposed afterwards.
/// </summary>
/// <param name="keyId">Key id passed to the serializer.</param>
/// <param name="column">Root of the column tree to serialize.</param>
private async Task SerializeColumn(long keyId, VectorNode column)
{
    using (var serializer = new ColumnSerializer(
        CollectionId,
        keyId,
        SessionFactory,
        ixFileExtension: "ixo",
        pageFileExtension: "ixop"))
    {
        await serializer.SerializeColumnSegment(column);
    }
}
/// <summary>
/// Counts the nodes on the chain of <c>Left</c> links starting at <paramref name="node"/>,
/// including <paramref name="node"/> itself.
/// </summary>
/// <param name="node">First node of the chain; may be null.</param>
/// <returns>The chain length; 0 for a null node.</returns>
public static int Depth(VectorNode node)
{
    var levels = 0;

    for (var current = node; current != null; current = current.Left)
    {
        levels++;
    }

    return levels;
}
/// <summary>
/// Folds the vectors of all nodes under <paramref name="root"/> into one vector by
/// repeatedly applying <c>VectorOperations.Merge</c>, starting from an empty vector.
/// </summary>
/// <param name="root">Tree whose node vectors are merged.</param>
/// <returns>The merged vector; empty when the tree has no nodes.</returns>
public static SortedList<long, int> Compress(VectorNode root)
{
    var merged = new SortedList<long, int>();

    foreach (var current in All(root))
    {
        merged = VectorOperations.Merge(merged, current.Vector);
    }

    return merged;
}
/// <summary>
/// Sums the vectors of all nodes under <paramref name="root"/> into a single vector,
/// starting from a vector built over an empty int array.
/// </summary>
/// <param name="root">Tree whose node vectors are added together.</param>
/// <returns>The accumulated vector.</returns>
public static Vector Compress(VectorNode root)
{
    var sum = new Vector(new int[0]);

    foreach (var current in PathFinder.All(root))
    {
        sum = sum.Add(current.Vector);
    }

    return sum;
}
/// <summary>
/// Adds every document id of <paramref name="node"/> to <paramref name="target"/>.
/// Does nothing when either node has no <c>DocIds</c> collection.
/// </summary>
/// <param name="target">Node receiving the ids.</param>
/// <param name="node">Node supplying the ids.</param>
public static void MergeDocIds(VectorNode target, VectorNode node)
{
    if (target.DocIds != null && node.DocIds != null)
    {
        foreach (var id in node.DocIds)
        {
            target.DocIds.Add(id);
        }
    }
}
/// <summary>
/// Looks up the index stored for (<paramref name="collectionId"/>, <paramref name="keyId"/>).
/// </summary>
/// <param name="collectionId">Collection to search.</param>
/// <param name="keyId">Key inside the collection.</param>
/// <param name="index">The stored node on success; null when the collection is unknown.</param>
/// <returns>True when an entry was found.</returns>
public bool TryGetIndex(ulong collectionId, long keyId, out VectorNode index)
{
    var columns = _index.GetIndex(collectionId);

    if (columns == null)
    {
        index = null;
        return false;
    }

    return columns.TryGetValue(keyId, out index);
}
/// <summary>
/// Adds one node per token to <paramref name="index"/>, passing along an append stream
/// opened on <c>{collection hash}.{keyId}.vec</c> inside the session directory.
/// </summary>
/// <param name="docId">Document id attached to every created node.</param>
/// <param name="keyId">Key id; used only to build the vector file name.</param>
/// <param name="index">Tree that receives the new nodes.</param>
/// <param name="tokens">Tokens to index; enumerated once.</param>
private void BuildInMemoryIndex(ulong docId, long keyId, VectorNode index, IEnumerable<string> tokens)
{
    var vectorFileName = Path.Combine(
        SessionFactory.Dir,
        string.Format("{0}.{1}.vec", CollectionId.ToHash(), keyId));

    // The stream is disposed once all tokens have been added.
    // (A token counter used to be maintained here but was never read; it has been removed.)
    using (var vectorStream = SessionFactory.CreateAppendStream(vectorFileName))
    {
        foreach (var token in tokens)
        {
            index.Add(new VectorNode(token, docId), vectorStream);
        }
    }
}
/// <summary>
/// Rebuilds a node from its stored fields, loading its vector from
/// <paramref name="vectorStream"/> through <paramref name="tokenizer"/>.
/// </summary>
/// <param name="vecOffset">Offset passed to the tokenizer to locate the vector.</param>
/// <param name="postingsOffset">Postings offset handed to the node constructor.</param>
/// <param name="componentCount">Number of vector components; narrowed to int for the tokenizer call.</param>
/// <param name="weight">Weight handed to the node constructor.</param>
/// <param name="terminator">Terminator code handed to the node constructor.</param>
/// <param name="vectorStream">Stream the vector is read from.</param>
/// <param name="tokenizer">Model that deserializes the vector.</param>
/// <returns>The constructed node.</returns>
public static VectorNode DeserializeNode(
    long vecOffset,
    long postingsOffset,
    long componentCount,
    long weight,
    long terminator,
    Stream vectorStream,
    IStringModel tokenizer)
{
    var components = tokenizer.DeserializeVector(vecOffset, (int)componentCount, vectorStream);

    return new VectorNode(postingsOffset, vecOffset, terminator, weight, componentCount, components);
}
/// <summary>
/// Appends a textual rendering of the tree under <paramref name="node"/> to
/// <paramref name="output"/>: one line per node, indented with <paramref name="depth"/> tabs.
/// </summary>
/// <remarks>
/// A node's <c>Left</c> subtree is rendered one level deeper; its <c>Right</c> subtree is
/// rendered at the same level, after the left subtree.
/// </remarks>
/// <param name="node">Node to render; null renders nothing.</param>
/// <param name="output">Builder receiving the text.</param>
/// <param name="depth">Indentation level of <paramref name="node"/>.</param>
private static void Visualize(VectorNode node, StringBuilder output, int depth)
{
    // Right siblings share the same depth, so they are handled by looping rather than
    // recursing; only Left children add a stack frame.
    while (node != null)
    {
        output.Append('\t', depth);

        // Append, not AppendFormat: the text is already fully formatted, and treating it
        // as a composite format string would throw on (or alter) any '{' or '}' it contains.
        output.Append($"{node.AngleWhenAdded} {node} w:{node.Weight}");
        output.AppendLine();

        Visualize(node.Left, output, depth + 1);

        node = node.Right;
    }
}
/// <summary>
/// Serializes <paramref name="column"/> into the index stream, records the resulting
/// page (offset and length) in the page index, and logs timing and size figures.
/// </summary>
/// <param name="column">Root of the column tree.</param>
/// <param name="vectorStream">Stream handed to the tree serializer for vectors.</param>
/// <param name="postingsStream">Stream handed to the tree serializer for postings.</param>
/// <param name="model">Model handed to the tree serializer.</param>
public void CreateColumnSegment(VectorNode column, Stream vectorStream, Stream postingsStream, IStringModel model)
{
    var timer = Stopwatch.StartNew();

    var segment = GraphBuilder.SerializeTree(column, _ixStream, vectorStream, postingsStream, model);
    _ixStream.Flush();

    _ixPageIndexWriter.Write(segment.offset, segment.length);
    _ixPageIndexWriter.Flush();

    var dimensions = PathFinder.Size(column);

    this.Log(
        "serialized column {0} in {1}. weight {2} depth {3} width {4} (avg depth {5})",
        _keyId,
        timer.Elapsed,
        column.Weight,
        dimensions.depth,
        dimensions.width,
        dimensions.avgDepth);
}
/// <summary>
/// Writes the column's postings, serializes the tree into the index stream, records the
/// resulting page (offset and length) in the page index, and logs timing and size figures.
/// </summary>
/// <param name="column">Root of the column tree.</param>
/// <param name="vectorStream">Stream handed to the tree serializer for vectors.</param>
public async Task CreateColumnSegment(VectorNode column, Stream vectorStream)
{
    var timer = Stopwatch.StartNew();

    await _postingsWriter.Write(column);

    var segment = VectorNodeWriter.SerializeTree(column, _ixStream, vectorStream);
    _ixStream.Flush();

    _ixPageIndexWriter.Write(segment.offset, segment.length);
    _ixPageIndexWriter.Flush();

    var dimensions = VectorNodeReader.Size(column);

    this.Log(
        "serialized column {0} in {1}. weight {2} depth {3} width {4} (avg depth {5})",
        _keyId,
        timer.Elapsed,
        column.Weight,
        dimensions.depth,
        dimensions.width,
        dimensions.avgDepth);
}
/// <summary>
/// Collects the postings offsets of <paramref name="node"/> into
/// <paramref name="target"/>'s <c>PostingsOffsets</c> list.
/// </summary>
/// <remarks>
/// The target's list is created on first use and seeded with the target's own
/// <c>PostingsOffset</c>. If <paramref name="node"/> has a list of its own, all of its
/// entries are appended; otherwise only its single <c>PostingsOffset</c> is.
/// </remarks>
/// <param name="target">Node accumulating the offsets.</param>
/// <param name="node">Node whose offsets are merged in.</param>
public static void MergePostings(VectorNode target, VectorNode node)
{
    if (target.PostingsOffsets == null)
    {
        target.PostingsOffsets = new List<long> { target.PostingsOffset };
    }

    if (node.PostingsOffsets != null)
    {
        ((List<long>)target.PostingsOffsets).AddRange(node.PostingsOffsets);
    }
    else
    {
        target.PostingsOffsets.Add(node.PostingsOffset);
    }
}
/// <summary>
/// Makes <paramref name="index"/> available under
/// (<paramref name="collectionId"/>, <paramref name="keyId"/>), all under <c>_sync</c>.
/// </summary>
/// <remarks>
/// Three cases: (1) the collection is unknown, so the index is added as is; (2) the collection
/// exists but has no entry for the key, so the index is stored directly; (3) an entry
/// already exists, so a clone of it is extended with every node under <c>index.Right</c>
/// (appending to <c>{collectionId}.{keyId}.vec</c>) and the clone is then added.
/// </remarks>
/// <param name="collectionId">Collection the index belongs to.</param>
/// <param name="keyId">Key the index belongs to.</param>
/// <param name="index">Newly built index to publish.</param>
public void Publish(ulong collectionId, long keyId, VectorNode index)
{
    lock (_sync)
    {
        var timer = Stopwatch.StartNew();

        VectorNode merged = null;
        var colIx = GetCollectionIndex(collectionId);

        if (colIx == null)
        {
            _index.Add(collectionId, keyId, index);
        }
        else if (colIx.ContainsKey(keyId))
        {
            // Work on a copy of the existing tree; it is added below once complete.
            merged = colIx[keyId].Clone();
        }
        else
        {
            colIx[keyId] = index;
        }

        if (merged != null)
        {
            using (var vectorStream = CreateAppendStream(
                Path.Combine(Dir, string.Format("{0}.{1}.vec", collectionId, keyId))))
            {
                foreach (var incoming in index.Right.All())
                {
                    merged.Add(incoming, vectorStream);
                }
            }

            _index.Add(collectionId, keyId, merged);
        }

        _log.Log(string.Format("published {0}.{1} in {2}", collectionId, keyId, timer.Elapsed));
    }
}
/// <summary>
/// Gathers, for every node under <paramref name="rootNode"/> that carries more than one
/// postings offset, a mapping from the node's <c>PostingsOffset</c> to its offsets list,
/// and passes the mapping to the dictionary overload of <c>Concat</c>.
/// Nothing is passed on when no node qualifies.
/// </summary>
/// <param name="rootNode">Root of the tree to scan.</param>
public void Concat(VectorNode rootNode)
{
    var offsets = new Dictionary<long, IList<long>>();

    foreach (var node in rootNode.All())
    {
        if (node.PostingsOffsets == null || node.PostingsOffsets.Count <= 1)
        {
            continue;
        }

        offsets.Add(node.PostingsOffset, node.PostingsOffsets);
    }

    if (offsets.Count > 0)
    {
        Concat(offsets);
    }
}
/// <summary>
/// Stores <paramref name="index"/> under (<paramref name="collectionId"/>, <paramref name="keyId"/>),
/// replacing any entry already present for that pair.
/// </summary>
/// <param name="collectionId">Key of the per-collection list inside <c>_ix</c>.</param>
/// <param name="keyId">Key inside the per-collection list.</param>
/// <param name="index">Node to store.</param>
public void Add(ulong collectionId, long keyId, VectorNode index)
{
    SortedList<long, VectorNode> collection;

    if (!_ix.TryGetValue(collectionId, out collection))
    {
        // Populate the new list before offering it, so it is never visible while empty.
        var fresh = new SortedList<long, VectorNode> { { keyId, index } };

        // GetOrAdd returns whichever list ended up registered. If another caller registered
        // one first, ours was not stored; the previous code ignored the return value and
        // silently lost the index in that case.
        collection = _ix.GetOrAdd(collectionId, fresh);

        if (ReferenceEquals(collection, fresh))
        {
            return;
        }
    }

    // The indexer setter adds the key when missing and overwrites it otherwise.
    // NOTE(review): the inner SortedList is not synchronized here (nor was it before) —
    // confirm callers do not add to the same collection concurrently.
    collection[keyId] = index;
}
/// <summary>
/// Reads <paramref name="indexStream"/> block by block (<c>VectorNode.BlockSize</c> bytes each),
/// deserializes a node from every complete block and adds it to <paramref name="root"/>
/// when its <c>VectorOffset</c> is greater than -1. Reading stops at the first incomplete block.
/// </summary>
/// <param name="indexStream">Stream of fixed-size node blocks.</param>
/// <param name="vectorStream">Stream handed to <c>DeserializeNode</c> for loading vectors.</param>
/// <param name="root">Tree that receives the nodes.</param>
/// <param name="identicalAngle">Not used by this method.</param>
/// <param name="foldAngle">Not used by this method.</param>
/// <param name="model">Model handed to <c>DeserializeNode</c> and <c>GraphBuilder.Add</c>.</param>
public static void DeserializeUnorderedFile(
    Stream indexStream,
    Stream vectorStream,
    VectorNode root,
    float identicalAngle,
    float foldAngle,
    IStringModel model)
{
    var buf = new byte[VectorNode.BlockSize];

    // Fills buf as far as the stream allows and returns the number of bytes obtained.
    // A single Stream.Read call may legitimately return fewer bytes than requested before
    // the end of the stream, which previously ended the loop early.
    int ReadBlock()
    {
        var total = 0;

        while (total < buf.Length)
        {
            var n = indexStream.Read(buf, total, buf.Length - total);

            if (n == 0)
            {
                break; // end of stream
            }

            total += n;
        }

        return total;
    }

    while (ReadBlock() == VectorNode.BlockSize)
    {
        var node = DeserializeNode(buf, vectorStream, model);

        if (node.VectorOffset > -1)
        {
            GraphBuilder.Add(root, node, model);
        }
    }
}
/// <summary>
/// Descends from <paramref name="root"/> looking for the node whose vector has the
/// highest cosine angle to <paramref name="vector"/>.
/// </summary>
/// <remarks>
/// At each node the walk goes <c>Left</c> when the angle exceeds <c>model.FoldAngle</c>
/// and <c>Right</c> otherwise; the best-scoring node seen on the way is remembered.
/// If no node scores above 0, the result carries <paramref name="root"/> with a score of 0.
/// </remarks>
/// <param name="root">Node to start from; may be null.</param>
/// <param name="vector">Query vector.</param>
/// <param name="model">Supplies the angle function and the fold threshold.</param>
/// <returns>A hit with the best score and the node that produced it.</returns>
public static Hit ClosestMatch(VectorNode root, Vector vector, IStringModel model)
{
    var winner = root;
    float top = 0;
    var current = root;

    while (current != null)
    {
        var angle = model.CosAngle(vector, current.Vector);
        var goLeft = angle > model.FoldAngle;

        // The best-so-far bookkeeping is the same whichever way the walk turns.
        if (angle > top)
        {
            top = angle;
            winner = current;
        }

        current = goLeft ? current.Left : current.Right;
    }

    return new Hit { Score = top, Node = winner };
}