/// <summary>
/// Scans the table for entities whose indexed property holds the same value and reports
/// every group of such entities through <paramref name="onCollision"/>.
/// </summary>
/// <remarks>
/// Phase 1 reads the table segment by segment and records an (id, hash) pair for every
/// entity that carries <c>indexPropertyName</c>. The hash comes from <c>hashingTransformer</c>.
/// Phase 2 seals the buffers and asks each one for hash collisions against itself and the
/// buffers after it. Every candidate id is then re-read from the table and grouped by the
/// actual property value using <c>equalityComparer</c>; only groups with more than one
/// entity are reported.
/// </remarks>
/// <param name="onCollision">
/// Invoked once per group of duplicates. The first argument is a <see cref="Guid"/> built
/// from the MD5 digest of the JSON-serialized property value; the second is the sequence of
/// entity ids (parsed from the partition keys) that share that value.
/// </param>
/// <exception cref="OutOfMemoryException">
/// A freshly allocated buffer of twice the previous size rejected a write.
/// </exception>
public void SearchForDuplicates(Action<Guid, IEnumerable<Guid>> onCollision)
{
    // Starting capacity only; the stream grows on demand so large values no longer overflow.
    const int initialKeyBufferCapacity = 4096;

    var query = new TableQuery { SelectColumns = columns };
    TableContinuationToken token = null;

    var buffers = new List<IdHashBuffer>();
    var buffer = new IdHashBuffer(InitialBufferSize);
    buffers.Add(buffer);

    // Phase 1: collect (id, hash) pairs for every entity that has the indexed property.
    do
    {
        var segment = table.ExecuteQuerySegmented(query, token);
        foreach (var dte in segment.Results)
        {
            // Parsed before the property lookup, so a malformed partition key always throws.
            var id = Guid.Parse(dte.PartitionKey);
            EntityProperty property;
            if (dte.Properties.TryGetValue(indexPropertyName, out property))
            {
                var hash = hashingTransformer(property.PropertyAsObject);
                if (buffer.TryWrite(id, hash) == false)
                {
                    // Current buffer rejected the write: continue in a new one of twice the size.
                    buffer = new IdHashBuffer(buffer.Size * 2);
                    buffers.Add(buffer);
                    if (buffer.TryWrite(id, hash) == false)
                    {
                        throw new OutOfMemoryException();
                    }
                }
            }
        }

        token = segment.ContinuationToken;
    } while (token != null);

    foreach (var b in buffers)
    {
        b.Seal();
    }

    // Phase 2: verify hash collisions against the real values and report true duplicates.
    // The serializer, hash algorithm and scratch stream are created once and reused.
    var serializer = JsonSerializer.Create();
    using (var md5 = MD5.Create())
    using (var ms = new MemoryStream(initialKeyBufferCapacity))
    using (var sw = new StreamWriter(ms))
    {
        for (var i = 0; i < buffers.Count; i++)
        {
            var b = buffers[i];

            // Skip(i) passes the current buffer and every buffer after it.
            b.FindHashCollisions(buffers.Skip(i), (hash, ids) =>
            {
                var collisions = ids
                    .Select(id => new TableQuery
                    {
                        FilterString = TableQuery.GenerateFilterCondition("PartitionKey", QueryComparisons.Equal, id.ToString()),
                        SelectColumns = columns
                    })
                    .Select(q => table.ExecuteQuery(q).SingleOrDefault())
                    .Where(dte => dte != null && dte.Properties.ContainsKey(indexPropertyName))
                    .GroupBy(
                        dte => dte.Properties[indexPropertyName].PropertyAsObject,
                        dte => Guid.Parse(dte.PartitionKey),
                        equalityComparer)
                    .Where(g => g.Count() > 1)
                    .ToArray();

                foreach (var collision in collisions)
                {
                    serializer.Serialize(sw, collision.Key);
                    sw.Flush();

                    // Hash only the bytes written for this key; the backing array may be larger.
                    var guid = new Guid(md5.ComputeHash(ms.GetBuffer(), 0, (int)ms.Length));

                    // Empty the stream so the next key starts from offset zero.
                    ms.SetLength(0);

                    onCollision(guid, collision);
                }
            });
        }
    }
}
/// <summary>
/// Scans the table for entities whose indexed property holds the same value and reports
/// every group of such entities through <paramref name="onCollision"/>.
/// </summary>
/// <remarks>
/// Phase 1 reads the table segment by segment and records an (id, hash) pair for every
/// entity that carries <c>indexPropertyName</c>. The hash comes from <c>hashingTransformer</c>.
/// Phase 2 seals the buffers and asks each one for hash collisions against itself and the
/// buffers after it. Every candidate id is then re-read from the table and grouped by the
/// actual property value using <c>equalityComparer</c>; only groups with more than one
/// entity are reported.
/// </remarks>
/// <param name="onCollision">
/// Invoked once per group of duplicates. The first argument is a <see cref="Guid"/> built
/// from the MD5 digest of the JSON-serialized property value; the second is the sequence of
/// entity ids (parsed from the partition keys) that share that value.
/// </param>
/// <exception cref="OutOfMemoryException">
/// A freshly allocated buffer of twice the previous size rejected a write.
/// </exception>
public void SearchForDuplicates(Action<Guid, IEnumerable<Guid>> onCollision)
{
    // Starting capacity only; the stream grows on demand so large values no longer overflow.
    const int initialKeyBufferCapacity = 4096;

    var query = new TableQuery { SelectColumns = columns };
    TableContinuationToken token = null;

    var buffers = new List<IdHashBuffer>();
    var buffer = new IdHashBuffer(InitialBufferSize);
    buffers.Add(buffer);

    // Phase 1: collect (id, hash) pairs for every entity that has the indexed property.
    do
    {
        var segment = table.ExecuteQuerySegmented(query, token);
        foreach (var dte in segment.Results)
        {
            // Parsed before the property lookup, so a malformed partition key always throws.
            var id = Guid.Parse(dte.PartitionKey);
            EntityProperty property;
            if (dte.Properties.TryGetValue(indexPropertyName, out property))
            {
                var hash = hashingTransformer(property.PropertyAsObject);
                if (buffer.TryWrite(id, hash) == false)
                {
                    // Current buffer rejected the write: continue in a new one of twice the size.
                    buffer = new IdHashBuffer(buffer.Size * 2);
                    buffers.Add(buffer);
                    if (buffer.TryWrite(id, hash) == false)
                    {
                        throw new OutOfMemoryException();
                    }
                }
            }
        }

        token = segment.ContinuationToken;
    } while (token != null);

    foreach (var b in buffers)
    {
        b.Seal();
    }

    // Phase 2: verify hash collisions against the real values and report true duplicates.
    // The serializer, hash algorithm and scratch stream are created once and reused.
    var serializer = JsonSerializer.Create();
    using (var md5 = MD5.Create())
    using (var ms = new MemoryStream(initialKeyBufferCapacity))
    using (var sw = new StreamWriter(ms))
    {
        for (var i = 0; i < buffers.Count; i++)
        {
            var b = buffers[i];

            // Skip(i) passes the current buffer and every buffer after it.
            b.FindHashCollisions(buffers.Skip(i), (hash, ids) =>
            {
                var collisions = ids
                    .Select(id => new TableQuery
                    {
                        FilterString = TableQuery.GenerateFilterCondition("PartitionKey", QueryComparisons.Equal, id.ToString()),
                        SelectColumns = columns
                    })
                    .Select(q => table.ExecuteQuery(q).SingleOrDefault())
                    .Where(dte => dte != null && dte.Properties.ContainsKey(indexPropertyName))
                    .GroupBy(
                        dte => dte.Properties[indexPropertyName].PropertyAsObject,
                        dte => Guid.Parse(dte.PartitionKey),
                        equalityComparer)
                    .Where(g => g.Count() > 1)
                    .ToArray();

                foreach (var collision in collisions)
                {
                    serializer.Serialize(sw, collision.Key);
                    sw.Flush();

                    // Hash only the bytes written for this key; the backing array may be larger.
                    var guid = new Guid(md5.ComputeHash(ms.GetBuffer(), 0, (int)ms.Length));

                    // Empty the stream so the next key starts from offset zero.
                    ms.SetLength(0);

                    onCollision(guid, collision);
                }
            });
        }
    }
}