/// <summary>
/// Build the index
/// </summary>
public virtual void Build (MetricDB db, int num_centers, Random rand)
{
    this.DB = db;
    var n = this.DB.Count;
    // randomized center selection has very good performance, even compared
    // with more "intelligent" strategies
    this.node_list = new List<Node> (num_centers);
    var subset = RandomSets.GetRandomSubSet (num_centers, this.DB.Count, rand);
    var H = new HashSet<int> (subset);
    for (int centerID = 0; centerID < num_centers; ++centerID) {
        this.node_list.Add (new Node (subset [centerID]));
    }
    // index the centers with a SAT_Distal so the nearest center is found with
    // a search instead of a brute-force scan over all centers
    var IDX = new SAT_Distal ();
    IDX.Build (new SampleSpace ("", db, subset), rand);
    for (int docID = 0; docID < n; ++docID) {
        if (docID % 1000 == 0) {
            Console.WriteLine ("== Vor {0}/{1}, num_centers: {2}, db: {3}",
                docID + 1, n, num_centers, db.Name);
        }
        if (H.Contains (docID)) {
            continue;
        }
        var _near = IDX.SearchKNN (this.DB [docID], 1, new Result (1)).First;
        this.node_list [_near.ObjID].Add (docID, _near.Dist);
    }
}
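// Usage sketch (hedged): the enclosing class is not named in this excerpt, so
// `VoronoiIndex` below is a hypothetical placeholder for it; `db` is assumed
// to be an already-loaded MetricDB.
//
//   var index = new VoronoiIndex ();
//   index.Build (db, 256, new Random (17));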
public virtual void Build (MetricDB db, int numcenters, int lambda, int seed)
{
    this.LC_LIST = new LC[lambda];
    LongParallel.For (0, lambda, (int i) => {
        // BuildOneClosure returns a deferred build action (see the
        // Parallel.ForEach variant further below); it must be invoked,
        // otherwise the instance is never actually built
        this.BuildOneClosure (this.LC_LIST, i, db, numcenters, new Random (seed + i)) ();
    });
}
public EPListMeanPivots (MetricDB DB, int seed, int num_pivs)
{
    this.Items = null;
    var pivs = new List<EPivot> (32);
    var rand = new Random (seed);
    var n = DB.Count;
    var idxseq = new DynamicSequentialOrdered ();
    idxseq.Build (DB, RandomSets.GetIdentity (DB.Count));
    var tmp_items = new List<ItemPair> (DB.Count);
    int next_piv = rand.Next (0, n);
    for (int i = 0; i < num_pivs; ++i) {
        var varX = 0.0;
        double min_diff = double.MaxValue;
        this.ComputeDistRow (next_piv, idxseq, rand, pivs, tmp_items);
        for (int objID = 0; objID < this.Items.Length; ++objID) {
            var u = this.Items [objID];
            var diff = Math.Abs (u.Dist - pivs [u.ObjID].mean);
            // the next pivot is the object whose distance lies closest to the
            // mean of its current pivot, i.e., the one worst discriminated
            if (diff < min_diff) {
                min_diff = diff;
                next_piv = objID;
            }
            varX += diff * diff / n;
        }
        Console.WriteLine ("XXXXXX i: {0}, variance: {1}", i + 1, varX);
    }
    this.Pivs = pivs.ToArray ();
    Console.WriteLine ("Number of pivots per group: {0}", this.Pivs.Length);
}
/// <summary>
/// Build the index
/// </summary>
public virtual void Build (MetricDB db, int num_centers, Random rand, SequenceBuilder seq_builder = null)
{
    this.DB = db;
    var n = this.DB.Count;
    // randomized center selection has very good performance, even compared
    // with more "intelligent" strategies
    this.node_list = new List<Node> (num_centers);
    var subset = RandomSets.GetRandomSubSet (num_centers, this.DB.Count, rand);
    for (int centerID = 0; centerID < num_centers; ++centerID) {
        this.node_list.Add (new Node (subset [centerID]));
    }
    var H = new HashSet<int> (subset);
    for (int docID = 0; docID < n; ++docID) {
        if (docID % 1000 == 0) {
            Console.WriteLine ("== {0} {1}/{2}, num_centers: {3}, db: {4}",
                this, docID + 1, n, num_centers, db.Name);
        }
        if (H.Contains (docID)) {
            continue;
        }
        // assign each object to its farthest center; distances are pushed
        // negated so the minimum kept by Result is the farthest center
        var far = new Result (1);
        for (var centerID = 0; centerID < num_centers; ++centerID) {
            var node = this.node_list [centerID];
            var d = this.DB.Dist (this.DB [node.refID], this.DB [docID]);
            far.Push (centerID, -d);
        }
        var _far = far.First;
        this.node_list [_far.docid].Add (docID, -_far.dist);
    }
}
public static List<ItemPair> ComputeDistances (MetricDB db, IEnumerable<int> sample, object piv,
    List<ItemPair> output, out Stats stats, out int min_objID, out int max_objID)
{
    if (output == null) {
        output = new List<ItemPair> ();
    }
    max_objID = min_objID = -1;
    stats = default (Stats);
    stats.min = double.MaxValue;
    stats.max = 0;
    double mean = 0;
    var count = 0;
    foreach (var objID in sample) {
        var dist = db.Dist (piv, db [objID]);
        mean += dist;
        output.Add (new ItemPair (objID, dist));
        if (dist < stats.min) {
            stats.min = dist;
            min_objID = objID;
        }
        if (dist > stats.max) {
            stats.max = dist;
            max_objID = objID;
        }
        ++count;
    }
    stats.mean = mean / count;
    // second pass for the standard deviation; note that this iterates over the
    // whole output list, so it assumes `output` was empty on entry
    double stddev = 0;
    foreach (var item in output) {
        var m = item.Dist - stats.mean;
        stddev += m * m;
    }
    stats.stddev = Math.Sqrt (stddev / count);
    return output;
}
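// A hedged alternative sketch: Welford's online algorithm computes mean and
// standard deviation in the same single pass as min/max, avoiding the second
// loop over `output` above. The `Stats` field names are taken from the method
// above; everything else here is illustrative, not part of the library.
public static Stats ComputeStatsWelford (MetricDB db, IEnumerable<int> sample, object piv)
{
    var stats = default (Stats);
    stats.min = double.MaxValue;
    stats.max = 0;
    double mean = 0, M2 = 0;
    int count = 0;
    foreach (var objID in sample) {
        var dist = db.Dist (piv, db [objID]);
        ++count;
        var delta = dist - mean;
        mean += delta / count;        // running mean
        M2 += delta * (dist - mean);  // running sum of squared deviations
        if (dist < stats.min) { stats.min = dist; }
        if (dist > stats.max) { stats.max = dist; }
    }
    stats.mean = mean;
    stats.stddev = count > 0 ? Math.Sqrt (M2 / count) : 0;
    return stats;
}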
public virtual void Build (MetricDB db, int num_pairs, int maxCandidates = -1)
{
    this.DB = db;
    this.Fingerprints = new BinQ8HammingSpace (1);
    this.Sample = new SampleSpace ("", this.DB, num_pairs * 2);
    this.MaxCandidates = maxCandidates;
    var n = this.DB.Count;
    var A = new byte[n][];
    int pc = this.DB.Count / 100 + 1;
    int advance = 0;
    var create_one = new Action<int> (delegate (int i) {
        var fp = this.GetFP (this.DB [i]);
        A [i] = fp;
        // advance is only a progress counter; races on it merely affect logging
        if (advance % pc == 0) {
            Console.WriteLine ("DEBUG {0} ({1}/{2}), db: {3}, num_pairs: {4}, timestamp: {5}",
                this, advance, n, db.Name, num_pairs, DateTime.Now);
        }
        advance++;
    });
    ParallelOptions ops = new ParallelOptions ();
    ops.MaxDegreeOfParallelism = -1;
    Parallel.For (0, n, ops, create_one);
    foreach (var fp in A) {
        this.Fingerprints.Add (fp);
    }
    var s = new Sequential ();
    s.Build (this.Fingerprints);
    this.InternalIndex = s;
}
/// <summary>
/// Build the index
/// </summary>
public virtual void Build (MetricDB db, int bsize, Random rand)
{
    this.DB = db;
    var n = this.DB.Count;
    // randomized reference selection has very good performance, even compared
    // with more "intelligent" strategies
    var dseq = new DynamicSequentialOrdered ();
    dseq.Build (db, rand);
    this.NODES = new List<Node> (n / bsize + 1);
    var L = new List<ItemPair> (n);
    while (dseq.Count > 0) {
        if (this.NODES.Count % 100 == 0) {
            Console.WriteLine ("XXX {0}, bucketSize: {1}, remain {2}/{3}, db: {4}, date-time: {5}",
                this, bsize, dseq.Count, db.Count, Path.GetFileName (db.Name), DateTime.Now);
        }
        var refID = dseq.GetAnyItem ();
        dseq.Remove (refID);
        L.Clear ();
        dseq.ComputeDistances (this.DB [refID], L);
        var near = new Result (bsize);
        var far = new Result (1);
        dseq.AppendKExtremes (near, far, L);
        var node = new Node (refID);
        this.NODES.Add (node);
        dseq.Remove (near);
        foreach (var p in near) {
            node.Add (p.ObjID, p.Dist);
        }
    }
}
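// Usage sketch (hedged): `ListOfClusters` stands in for the enclosing class,
// whose name is not shown in this excerpt; `db` is an already-loaded MetricDB.
// Buckets of the `bsize` nearest remaining items are peeled off around random
// references until the database is exhausted.
//
//   var lc = new ListOfClusters ();
//   lc.Build (db, 64, new Random (7));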
public void Build (MetricDB db, int k, Index ref_index)
{
    this.DB = db;
    this.K = k;
    this.R = ref_index;
    int sigma = this.R.DB.Count;
    this.INVINDEX = new List<List<int>> (sigma);
    for (int i = 0; i < sigma; ++i) {
        this.INVINDEX.Add (new List<int> ());
    }
    var A = new int[this.DB.Count][];
    int count = 0;
    var compute_one = new Action<int> (delegate (int objID) {
        var u = this.GetKnr (this.DB [objID], this.K);
        A [objID] = u;
        // count is only a progress counter; races on it merely affect logging
        ++count;
        if (count % 1000 == 0) {
            Console.WriteLine ("==== {0}/{1} db: {2}, k: {3}",
                count, this.DB.Count, this.DB.Name, k);
        }
    });
    ParallelOptions ops = new ParallelOptions ();
    ops.MaxDegreeOfParallelism = -1;
    Parallel.ForEach (new ListGen<int> ((int i) => i, this.DB.Count), ops, compute_one);
    // invert: reference u[i] lists every object having it among its K nearest
    // references
    for (int objID = 0; objID < this.DB.Count; ++objID) {
        var u = A [objID];
        for (int i = 0; i < this.K; ++i) {
            this.INVINDEX [u [i]].Add (objID);
        }
    }
}
/// <summary>
/// Build the index
/// </summary>
public virtual void Build (MetricDB db, Random rand)
{
    this.DB = db;
    var n = this.DB.Count;
    var items = RandomSets.GetIdentity (n);
    this.root = new Node (items, db, rand, false);
}
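// Usage sketch (hedged): `SATree` is a hypothetical placeholder for the
// enclosing class. The whole database becomes the root's item set and the
// tree is built recursively from it.
//
//   var sat = new SATree ();
//   sat.Build (db, new Random (42));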
public void Build (MetricDB db, int k, int num_refs, Random rand)
{
    var sample = new SampleSpace ("", db, num_refs, rand);
    var I = new SAT_Distal ();
    I.Build (sample, rand);
    this.Build (db, k, I);
}
/// <summary>
/// Build the index
/// </summary>
public virtual void Build (MetricDB db, int num_centers, Random rand, SequenceBuilder seq_builder = null)
{
    this.DB = db;
    var n = this.DB.Count;
    // randomized center selection has very good performance, even compared
    // with more "intelligent" strategies
    this.node_list = new List<Node> (num_centers);
    var subset = RandomSets.GetRandomSubSet (num_centers, this.DB.Count, rand);
    for (int centerID = 0; centerID < num_centers; ++centerID) {
        this.node_list.Add (new Node (subset [centerID]));
    }
    var H = new HashSet<int> (subset);
    for (int docID = 0; docID < n; ++docID) {
        if (H.Contains (docID)) {
            continue;
        }
        var near = new Result (1);
        var far = new Result (1);
        for (var centerID = 0; centerID < num_centers; ++centerID) {
            var node = this.node_list [centerID];
            var d = this.DB.Dist (this.DB [node.refID], this.DB [docID]);
            near.Push (centerID, d);
            far.Push (centerID, -d);
        }
        var _near = near.First;
        var _far = far.First;
        this.node_list [_near.ObjID].AddNear (docID, _near.Dist);
        this.node_list [_far.ObjID].AddFar (docID, -_far.Dist);
    }
}
public virtual void Build (MetricDB original, Index refs, int k)
{
    this.K = k;
    this.IdxRefs = refs;
    var n = original.Count;
    this.Fingerprints = new List<int[]> (n);
    for (int i = 0; i < n; ++i) {
        this.Fingerprints.Add (null);
    }
    var tasks = Environment.ProcessorCount << 3;
    int blocksize = Math.Max (1, n / tasks); // guard against n < tasks
    int advance = 0;
    long minElapsedTicks = 20000000; // control the print rate
    long prevTicks = DateTime.Now.Ticks;
    long currTicks;
    var create_block = new Action<int> (delegate (int blockID) {
        var sp = blockID * blocksize;
        var ep = Math.Min (n, sp + blocksize);
        currTicks = DateTime.Now.Ticks;
        if (advance == 0 || currTicks - prevTicks > minElapsedTicks) {
            Console.WriteLine ("KnrFP {0} ({1}/{2}), db: {3}, num_refs: {4}, K: {5}, timestamp: {6}",
                this, advance, n, Path.GetFileName (original.Name),
                this.IdxRefs.DB.Count, this.K, DateTime.Now);
            prevTicks = currTicks;
        }
        for (; sp < ep; ++sp) {
            var fp = this.GetFP (original [sp]);
            this.Fingerprints [sp] = fp;
            advance++;
        }
    });
    LongParallel.For (0, 1 + n / blocksize, create_block);
    Console.WriteLine ("done");
}
public void Build (MetricDB db, Random rand, int num_refs, int K = 7, int maxcand = 1024, SequenceBuilder seq_builder = null)
{
    var sample = new SampleSpace ("", db, num_refs, rand);
    var sat = new SAT_Distal ();
    sat.Build (sample, RandomSets.GetRandom ());
    this.Build (db, sat, K, maxcand, seq_builder);
}
public virtual void Build (MetricDB original, MetricDB pairs)
{
    this.Fingerprints = new BinQ8HammingSpace (1);
    this.Pairs = pairs;
    var n = original.Count;
    var A = new byte[n][];
    int blocksize = 1000;
    int pc = original.Count / 100 + 1;
    int advance = 0;
    var create_block = new Action<int> (delegate (int blockID) {
        var sp = blockID * blocksize;
        var ep = Math.Min (n, sp + blocksize);
        for (; sp < ep; ++sp) {
            var fp = this.GetFP (original [sp]);
            A [sp] = fp;
            if (advance % pc == 0) {
                Console.WriteLine ("DEBUG {0} ({1}/{2}), db: {3}, num_pairs: {4}, timestamp: {5}",
                    this, advance, n, original.Name, this.Pairs.Count / 2, DateTime.Now);
            }
            advance++;
        }
    });
    ParallelOptions ops = new ParallelOptions ();
    // note: a degree of parallelism of 1 makes this loop effectively sequential
    ops.MaxDegreeOfParallelism = 1;
    Parallel.For (0, 1 + n / blocksize, ops, create_block);
    foreach (var fp in A) {
        this.Fingerprints.Add (fp);
    }
}
public static void EstimateQueryStatistics (MetricDB DB, Random rand, int num_queries, int sample_size,
    out double mean, out double varY, out double qrad)
{
    var n = DB.Count;
    var N = num_queries * sample_size;
    mean = 0.0;
    var square_mean = 0.0;
    qrad = 0;
    for (int qID = 0; qID < num_queries; ++qID) {
        var q = DB [rand.Next (0, n)];
        var min = double.MaxValue;
        for (int sampleID = 0; sampleID < sample_size; ++sampleID) {
            var u = DB [rand.Next (0, n)];
            var d = DB.Dist (q, u);
            mean += d / N;
            square_mean += d * d / N;
            if (d > 0) {
                min = Math.Min (min, d);
            }
        }
        // qrad is the largest, over all sampled queries, of the smallest
        // positive distance seen for each query
        qrad = Math.Max (min, qrad);
        // alternative smoothing of qrad:
        // if (qrad == 0) {
        //     qrad = min;
        // } else {
        //     qrad = (min + qrad) * 0.5;
        // }
    }
    varY = square_mean - mean * mean;
}
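// Usage sketch (hedged): estimates the distance distribution of `db` by
// sampling, e.g. to pick a query radius for range-search experiments.
//
//   double mean, varY, qrad;
//   EstimateQueryStatistics (db, new Random (1), 64, 1024, out mean, out varY, out qrad);
//   Console.WriteLine ("mean: {0}, var: {1}, qrad: {2}", mean, varY, qrad);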
public void Build (MetricDB db, ANNISetup setup, int num_indexes, int num_tasks = -1)
{
    this.DB = db;
    var _rows = new ANNI[num_indexes];
    LongParallel.For (0, num_indexes, (int i) => {
        _rows [i] = new ANNI ();
        _rows [i].InternalBuild (setup, 0, 1.0, db, num_indexes);
    }, num_tasks);
    // the first ANNI becomes the leader; the rest are kept as plain rows
    this.leader = new NANNI ();
    this.leader.Build (_rows [0]);
    this.rows = new ANNI[num_indexes - 1];
    for (int i = 1; i < num_indexes; ++i) {
        this.rows [i - 1] = _rows [i];
    }
}
public virtual void Build (MetricDB db, ANNISetup setup)
{
    this.DB = db;
    var rows = new List<ANNI> ();
    this.leader = new NANNI ();
    var ilc = new ANNI ();
    var cost = ilc.InternalBuild (setup, 0, 1.0, db, 2);
    this.leader.Build (ilc);
    int m = this.leader.clusters.Count;
    // review_prob estimates the fraction of the database reviewed by a query
    // after discounting the m cluster centers; rows are added while it stays
    // above sqrt(n)/n
    double review_prob = (cost.SingleCost - m) / this.DB.Count;
    var min_prob = Math.Sqrt (this.DB.Count) / this.DB.Count;
    while (review_prob > min_prob) {
        var row = new ANNI ();
        rows.Add (row);
        var _cost = row.InternalBuild (setup, m, review_prob, db, 2);
        var _m = row.ACT.Count;
        review_prob *= (_cost.SingleCost - _m) / this.DB.Count;
    }
    this.rows = rows.ToArray ();
}
public void Build (MetricDB db, int num_refs, Random rand)
{
    if (num_refs > 16) {
        throw new ArgumentOutOfRangeException (String.Format (
            "num_refs must be at most 16, num_refs: {0}", num_refs));
    }
    this.DB = db;
    int n = db.Count;
    this.refs = new SampleSpace ("", db, num_refs, rand);
    var G = new long[n];
    for (int objID = 0; objID < n; ++objID) {
        var u = this.DB [objID];
        G [objID] = this.GetHash (u);
    }
    this.TABLE = new Dictionary<long, List<int>> ();
    for (int objID = 0; objID < n; ++objID) {
        var hash = G [objID];
        List<int> L;
        if (!this.TABLE.TryGetValue (hash, out L)) {
            L = new List<int> ();
            this.TABLE.Add (hash, L);
        }
        L.Add (objID);
    }
    int m = 0;
    foreach (var p in this.TABLE) {
        m += p.Value.Count;
        Console.WriteLine ("@@@@> key: {0}, count: {1}", p.Key, p.Value.Count);
    }
    Console.WriteLine ("===== @@@ hashes: {0}, n: {1}, m: {2}", this.TABLE.Count, n, m);
}
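// Hedged sketch: GetHash is not shown in this excerpt. One plausible encoding
// consistent with the 16-reference limit above packs 4 bits per reference into
// the 64-bit key, e.g. a quantized distance to each reference. This is an
// assumption about the scheme, not the library's actual GetHash.
static long ExampleHash (MetricDB db, SampleSpace refs, object u)
{
    long h = 0;
    for (int i = 0; i < refs.Count; ++i) {
        var d = db.Dist (u, refs [i]);
        // hypothetical 4-bit bucket; assumes distances scaled to [0, 16)
        long bucket = Math.Min (15L, (long)d);
        h |= bucket << (4 * i);
    }
    return h;
}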
public virtual void Build (MetricDB db, ANNISetup setup, int num_indexes, int num_tasks = -1)
{
    this.DB = db;
    --num_indexes; // one instance is consumed by the leader
    this.rows = new ANNI[num_indexes];
    this.leader = new NANNI ();
    var ilc = new ANNI ();
    var cost = ilc.InternalBuild (setup, 0, 1.0, db, 2);
    this.leader.Build (ilc);
    int m = this.leader.clusters.Count;
    double review_prob = (cost.SingleCost - m) / this.DB.Count;
    Console.WriteLine ("====> num_indexes: {0}", num_indexes);
    LongParallel.For (0, num_indexes, (int i) => {
        this.rows [i] = new ANNI ();
        this.rows [i].InternalBuild (setup, m, review_prob, db, num_indexes);
    }, num_tasks);
}
public void Build (MetricDB DB, double alpha_stddev, int min_bs, int seed)
{
    var idxDynamic = new DynamicSequential (seed);
    idxDynamic.Build (DB);
    this.pivots_list = new List<int> ();
    this.pivots_idx = new int[DB.Count];
    this.pivots_dist = new float[DB.Count];
    int I = 0;
    while (idxDynamic.DOCS.Count > 0) {
        var pidx = idxDynamic.GetRandom ();
        object piv = DB [pidx];
        idxDynamic.Remove (pidx);
        this.pivots_list.Add (pidx);
        this.pivots_dist [pidx] = 0;
        this.pivots_idx [pidx] = pidx;
        double mean, stddev;
        IResult near, far;
        idxDynamic.SearchExtremesRange (piv, alpha_stddev, min_bs, out near, out far, out mean, out stddev);
        // near items store a positive distance to their pivot; far items are
        // marked with a negative distance
        foreach (var pair in near) {
            this.pivots_idx [pair.docid] = pidx;
            this.pivots_dist [pair.docid] = (float)pair.dist;
        }
        foreach (var pair in far) {
            this.pivots_idx [pair.docid] = pidx;
            this.pivots_dist [pair.docid] = (float)-pair.dist;
        }
        if (I % 10 == 0) {
            Console.WriteLine ("--- I {0}> remains: {1}, alpha_stddev: {2}, mean: {3}, stddev: {4}, pivot: {5}",
                I, idxDynamic.DOCS.Count, alpha_stddev, mean, stddev, pidx);
            double near_first, near_last, far_first, far_last;
            if (near.Count == 0) {
                near_first = near_last = -1;
            } else {
                near_first = near.First.dist;
                near_last = near.Last.dist;
            }
            if (far.Count == 0) {
                far_first = far_last = -1;
            } else {
                far_first = -far.Last.dist;
                far_last = -far.First.dist;
            }
            Console.WriteLine ("--- +++ first-near: {0}, last-near: {1}, first-far: {2}, last-far: {3}, near-count: {4}, far-count: {5}",
                near_first, near_last, far_first, far_last, near.Count, far.Count);
            Console.WriteLine ("--- +++ normalized first-near: {0}, last-near: {1}, first-far: {2}, last-far: {3}, mean: {4}, stddev: {5}",
                near_first / far_last, near_last / far_last, far_first / far_last,
                far_last / far_last, mean / far_last, stddev / far_last);
        }
        ++I;
        idxDynamic.Remove (near);
        idxDynamic.Remove (far);
    }
    Console.WriteLine ("Number of pivots per group: {0}", this.pivots_list.Count);
}
public override void Load (BinaryReader Input)
{
    base.Load (Input);
    this.PIVS = SpaceGenericIO.SmartLoad (Input, false);
    this.DIST = new List<double>[this.PIVS.Count];
    for (int i = 0; i < this.PIVS.Count; ++i) {
        this.DIST [i] = new List<double> (this.DB.Count);
        PrimitiveIO<double>.LoadVector (Input, this.DB.Count, this.DIST [i]);
    }
}
/// <summary>
/// Build the index
/// </summary>
public virtual void Build (MetricDB db, int m)
{
    this.DB = db;
    var n = this.DB.Count;
    var pivsel = new PivotSelectorRandom (n, RandomSets.GetRandom ());
    this.nodes = new Node[m];
    for (int i = 0; i < m; ++i) {
        this.nodes [i] = new Node (db, pivsel.NextPivot ());
    }
}
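// Usage sketch (hedged): `PivotTable` is a hypothetical placeholder for the
// enclosing class; m randomly selected pivots are promoted to nodes.
//
//   var idx = new PivotTable ();
//   idx.Build (db, 64);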
public virtual void Build (MetricDB db, int numcenters, int lambda, int seed)
{
    var A = new List<Action> ();
    this.LC_LIST = new LC[lambda];
    for (int i = 0; i < lambda; ++i) {
        A.Add (this.BuildOneClosure (this.LC_LIST, i, db, numcenters, new Random (seed + i)));
    }
    var ops = new ParallelOptions ();
    ops.MaxDegreeOfParallelism = -1;
    Parallel.ForEach (A, ops, (action) => action ());
}
public void Build (MetricDB db, int sample_size, int num_instances, SequenceBuilder seq_builder = null)
{
    this.DB = db;
    this.lsc_indexes = new LSC[num_instances];
    for (int i = 0; i < num_instances; ++i) {
        var lsc = new LSC_H8 ();
        lsc.Build (db, sample_size, seq_builder);
        this.lsc_indexes [i] = lsc;
    }
}
public virtual void Build (MetricDB db, int numcenters, int lambda, SequenceBuilder seq_builder = null)
{
    var A = new List<Action> ();
    this.LC_LIST = new LC_RNN[lambda];
    for (int i = 0; i < lambda; ++i) {
        A.Add (this.BuildOneClosure (this.LC_LIST, i, db, numcenters, seq_builder));
    }
    var ops = new ParallelOptions ();
    ops.MaxDegreeOfParallelism = -1;
    Parallel.ForEach (A, ops, (action) => action ());
}
public void Build (MetricDB db, int num_indexes, int num_refs_per_instance, int k, int MAXCAND)
{
    var seed = RandomSets.GetRandomInt ();
    this.DB = db;
    this.MAXCAND = MAXCAND;
    this.Indexes = new KnrSeqSearch[num_indexes];
    for (int i = 0; i < num_indexes; ++i) {
        this.Indexes [i] = new KnrSeqSearch ();
        this.Indexes [i].Build (db, new Random (seed + i), num_refs_per_instance, k, MAXCAND);
    }
}
public void BuildDoublingReferences (MetricDB db, int num_indexes, int smaller_num_refs, int k, int MAXCAND)
{
    var seed = RandomSets.GetRandomInt ();
    this.DB = db;
    this.MAXCAND = MAXCAND;
    this.Indexes = new KnrSeqSearch[num_indexes];
    for (int i = 0; i < num_indexes; ++i) {
        this.Indexes [i] = new KnrSeqSearch ();
        // the i-th instance uses smaller_num_refs * 2^i references
        this.Indexes [i].Build (db, new Random (seed + i), smaller_num_refs << i, k, MAXCAND);
    }
}
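// Usage note (hedged): with smaller_num_refs = 128 and num_indexes = 4 the
// ensemble instances use 128, 256, 512, and 1024 references respectively
// (smaller_num_refs << i), trading memory and build time for recall.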
public void AssertEqualityDB (MetricDB db0, MetricDB db1)
{
    Console.WriteLine ("Checking equality between original and saved databases");
    for (int i = 0; i < db0.Count; ++i) {
        var d = db0.Dist (db0 [i], db1 [i]);
        if (d != 0) {
            throw new Exception ("=== ASSERTION ERROR: databases are not identical");
        }
    }
    Console.WriteLine ("OK");
}
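// Usage sketch (hedged): typically called after a save/load round trip, with
// `db0` the in-memory database and `db1` the reloaded copy; objects at equal
// positions must be at distance zero.
//
//   this.AssertEqualityDB (db0, db1);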
public virtual void Build (MetricDB db, int num_trees, Random rand)
{
    this.DB = db;
    this.forest = new SAT_ApproxSearch[num_trees];
    var action_list = new Action[num_trees];
    var seed = rand.Next ();
    for (int i = 0; i < num_trees; ++i) {
        action_list [i] = this.ClosureBuildOne (i, seed + i);
    }
    LongParallel.ForEach (action_list, (a) => a.Invoke ());
}
public void Build (MetricDB db, ANNISetup setup, int num_indexes, bool optimizeDistances)
{
    this.DB = db;
    var k = 1 + setup.ExpectedK; // since we use items from the database as training queries
    // select the queries used to evaluate the construction
    var qlist = RandomSets.GetRandomSubSet (setup.NumberQueries, this.DB.Count);
    this.leader = new XNANNI ();
    this.leader.PartialBuild (db, setup.Selector);
    this.rows = new TANNI[num_indexes - 1];
    for (int i = 0; i < this.rows.Length; ++i) {
        this.rows [i] = new TANNI ();
        this.rows [i].PartialBuild (db, setup.Selector);
    }
    double currT = long.MaxValue;
    double prevT = 0;
    double currD = this.DB.Count;
    double prevD = 0;
    int iter = 0;
    Console.WriteLine ("xxxxxxxx BEGIN> db: {0}, indexes: {1}, setup: {2}",
        Path.GetFileName (this.DB.Name), num_indexes, setup);
    do {
        this.PromotePivots (setup);
        prevT = currT;
        prevD = currD;
        currT = DateTime.Now.Ticks;
        currD = 0; // restart the distance counter for this round
        foreach (var qID in qlist) {
            var q = this.DB [qID];
            var res = new Result (k);
            currD += this.InternalSearchKNN (q, k, res);
        }
        currT = DateTime.Now.Ticks - currT;
        currT /= qlist.Length;
        currD /= qlist.Length;
        ++iter;
        Console.WriteLine ("======> iter: {0}, timestamp: {1}, setup: {2}", iter, DateTime.Now, setup);
        Console.WriteLine ("------> prevT: {0}, currT: {1}, prevT / currT: {2}", prevT, currT, prevT / currT);
        Console.WriteLine ("------> prevD: {0}, currD: {1}, prevD / currD: {2}", prevD, currD, prevD / currD);
        // stop when the previous round is no longer worse than the current
        // one by at least an AlphaStop fraction
        if (optimizeDistances) {
            if (prevD < currD * (1 + setup.AlphaStop)) {
                break;
            }
        } else {
            if (prevT < currT * (1 + setup.AlphaStop)) {
                break;
            }
        }
    } while (true);
}
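// Hedged reading of the stopping rule above: with AlphaStop = 0.05 the loop
// ends once the previous round's cost is less than 1.05 times the current one,
// i.e. the last pivot-promotion round improved the measured cost (query time
// or computed distances per query) by less than 5%.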