/// <summary>
/// Builds one Bloom filter per depot returned for the "docs.microsoft.com/dotnet/" base path
/// (only depots whose system metadata has <c>MetadataConstants.ActiveKey</c> set to true),
/// appends the serialized filters to <c>output.json</c>, then tests every collected asset ID
/// against every filter and appends a histogram of match counts to <c>conflictCount.txt</c>.
/// Blocks on <c>Console.ReadLine()</c> before returning.
/// </summary>
/// <param name="clientName">Client name handed to <c>DocumentHostingServiceClient</c>.</param>
/// <param name="accessKey">Access key handed to <c>DocumentHostingServiceClient</c>.</param>
/// <exception cref="InvalidOperationException">
/// A page of documents could not be fetched after three attempts.
/// </exception>
public static void TestDotnerApiBloomFilters(string clientName, string accessKey)
{
    const int maxAttempts = 3;
    double falsePositiveRate = 0.00001;

    // Sandbox alternative: "https://op-dhs-sandbox-read.azurewebsites.net/"
    string baseUri = "https://op-dhs-prod-read-nus.azurewebsites.net/";
    IDocumentHostingService dhsClient = new DocumentHostingServiceClient(new Uri(baseUri), clientName, accessKey);

    IList<BloomFilter> result = new List<BloomFilter>();
    List<string> assetIds = new List<string>();

    IList<GetDepotResponse> depots = dhsClient.GetAllDepotsBySiteBasePath("docs", "docs.microsoft.com/dotnet/", null, CancellationToken.None).Result;
    foreach (GetDepotResponse depot in depots)
    {
        if (!depot.SystemMetadata.GetValueOrDefault<bool>(MetadataConstants.ActiveKey))
        {
            continue;
        }

        string depotName = depot.DepotName;
        string continueAt = null;
        Console.WriteLine($"{depotName} Start.");

        // Accumulate into a list so the documents are counted and iterated without
        // re-walking a chain of deferred Concat() calls.
        List<GetDocumentResponse> allDocuments = new List<GetDocumentResponse>();
        int i = 0;
        do
        {
            // Fetch one page, retrying a bounded number of times. The attempt counter
            // (not the page counter) drives the loop, so a persistent failure terminates.
            GetDocumentsResponse documents = null;
            Exception lastError = null;
            for (int attempt = 1; attempt <= maxAttempts && documents == null; attempt++)
            {
                try
                {
                    documents = dhsClient.GetDocumentsPaginated(depot.DepotName, "en-us", "live", true, continueAt, null, null, CancellationToken.None).Result;
                }
                catch (Exception e)
                {
                    lastError = e;
                    Console.WriteLine(e);
                    Console.WriteLine($"Retry for {attempt} times");
                }
            }

            if (documents == null)
            {
                // Continuing would either re-request the same page forever or build a
                // filter that silently misses documents, so fail loudly instead.
                throw new InvalidOperationException(
                    $"Failed to fetch page {i + 1} of depot '{depotName}' after {maxAttempts} attempts.",
                    lastError);
            }

            allDocuments.AddRange(documents.Documents);
            continueAt = documents.ContinueAt;

            i++;
            Console.WriteLine($"{i:000} ..................");
        }
        while (!string.IsNullOrEmpty(continueAt));

        Console.WriteLine($"{depotName} Size: {allDocuments.Count}.");
        var bloomFilter = new BloomFilter(allDocuments.Count, falsePositiveRate);
        foreach (GetDocumentResponse document in allDocuments)
        {
            bloomFilter.Add(document.AssetId);
            assetIds.Add(document.AssetId);
        }

        Console.WriteLine($"{depotName} Bloom Filter Size: {bloomFilter.BitLength / 1024 / 8} KB.");
        result.Add(bloomFilter);
        Console.WriteLine($"{depotName} Done.");
    }

    // The writer is opened in append mode: repeated runs add another JSON line to the file.
    using (StreamWriter file = new StreamWriter(@"output.json", true))
    {
        file.WriteLine(JsonConvert.SerializeObject(result));
    }

    // For each asset ID, count how many depot filters report it as present.
    // Exactly one match is tallied in onlyOneCount; every other match count goes into the histogram.
    int onlyOneCount = 0;
    var conflictHashDict = new Dictionary<int, int>();
    foreach (string assetId in assetIds)
    {
        int matchCount = result.Count(filter => filter.Contains(assetId));
        if (matchCount == 1)
        {
            onlyOneCount++;
            continue;
        }

        int seen;
        conflictHashDict.TryGetValue(matchCount, out seen); // seen stays 0 when the key is absent
        conflictHashDict[matchCount] = seen + 1;
    }

    Console.WriteLine($"Only one count: {onlyOneCount}, total count: {assetIds.Count}");
    // Note: this prints the number of distinct match counts, not the number of asset IDs.
    Console.WriteLine($"Duplicate count: {conflictHashDict.Count}");

    // OrderBy + Reverse is kept (rather than OrderByDescending) so ties keep their original output order.
    var output = conflictHashDict.OrderBy(p => p.Value).Reverse();
    using (StreamWriter file = new StreamWriter(@"conflictCount.txt", true))
    {
        foreach (var item in output)
        {
            file.WriteLine($"{item.Key}: {item.Value}");
        }
    }

    Console.ReadLine();
}
/// <summary>
/// Builds a <c>DepotBloomFilter</c> (depot name, filter bits, document count, false-positive rate)
/// from the asset IDs of a depot under the "docs.microsoft.com/dotnet/" base path and serializes
/// the resulting list as BSON into an in-memory stream. Only depots whose system metadata has
/// <c>MetadataConstants.ActiveKey</c> set to true are processed.
/// Blocks on <c>Console.ReadLine()</c> before returning.
/// </summary>
/// <param name="clientName">Client name handed to <c>DocumentHostingServiceClient</c>.</param>
/// <param name="accessKey">Access key handed to <c>DocumentHostingServiceClient</c>.</param>
/// <exception cref="InvalidOperationException">
/// A page of documents could not be fetched after three attempts.
/// </exception>
public static void GenerateDotnetApiBloomFilter(string clientName, string accessKey)
{
    const int maxAttempts = 3;
    double falsePositiveRate = 0.00001;

    // Sandbox alternative: "https://op-dhs-sandbox-read.azurewebsites.net/"
    string baseUri = "https://op-dhs-prod-read-nus.azurewebsites.net/";
    IDocumentHostingService dhsClient = new DocumentHostingServiceClient(new Uri(baseUri), clientName, accessKey);

    IList<DepotBloomFilter> result = new List<DepotBloomFilter>();

    IList<GetDepotResponse> depots = dhsClient.GetAllDepotsBySiteBasePath("docs", "docs.microsoft.com/dotnet/", null, CancellationToken.None).Result;

    // NOTE(review): Skip(20).Take(1) restricts processing to the 21st depot only.
    // This looks like a debugging leftover - confirm before relying on the output.
    foreach (GetDepotResponse depot in depots.Skip(20).Take(1))
    {
        if (!depot.SystemMetadata.GetValueOrDefault<bool>(MetadataConstants.ActiveKey))
        {
            continue;
        }

        string depotName = depot.DepotName;
        string continueAt = null;
        Console.WriteLine($"{depotName} Start.");

        // Accumulate into a list so the documents are counted and iterated without
        // re-walking a chain of deferred Concat() calls.
        List<GetDocumentResponse> allDocuments = new List<GetDocumentResponse>();
        int i = 0;
        do
        {
            // Fetch one page, retrying a bounded number of times. The attempt counter
            // (not the page counter) drives the loop, so a persistent failure terminates.
            GetDocumentsResponse documents = null;
            Exception lastError = null;
            for (int attempt = 1; attempt <= maxAttempts && documents == null; attempt++)
            {
                try
                {
                    documents = dhsClient.GetDocumentsPaginated(depot.DepotName, "en-us", "live", true, continueAt, null, null, CancellationToken.None).Result;
                }
                catch (Exception e)
                {
                    lastError = e;
                    Console.WriteLine(e);
                    Console.WriteLine($"Retry for {attempt} times");
                }
            }

            if (documents == null)
            {
                // Continuing would either re-request the same page forever or build a
                // filter that silently misses documents, so fail loudly instead.
                throw new InvalidOperationException(
                    $"Failed to fetch page {i + 1} of depot '{depotName}' after {maxAttempts} attempts.",
                    lastError);
            }

            allDocuments.AddRange(documents.Documents);
            continueAt = documents.ContinueAt;

            i++;
            Console.WriteLine($"{i:000} ..................");
        }
        while (!string.IsNullOrEmpty(continueAt));

        Console.WriteLine($"{depotName} Size: {allDocuments.Count}.");
        var bloomFilter = new BloomFilter(allDocuments.Count, falsePositiveRate);
        foreach (GetDocumentResponse document in allDocuments)
        {
            bloomFilter.Add(document.AssetId);
        }

        Console.WriteLine($"{depotName} Bloom Filter Size: {bloomFilter.BitLength / 1024 / 8} KB.");
        result.Add(new DepotBloomFilter
        {
            DepotName = depotName,
            BloomFilter = bloomFilter.BitArray,
            Count = allDocuments.Count,
            FalsePositiveRate = falsePositiveRate
        });
        Console.WriteLine($"{depotName} Done.");
    }

    // An earlier, disabled variant appended JsonConvert.SerializeObject(result) to "output.json".
    // NOTE(review): the BSON payload below is written to memory only and is never read or
    // persisted within this method - confirm whether it should be saved somewhere.
    using (MemoryStream ms = new MemoryStream())
    using (BsonWriter writer = new BsonWriter(ms))
    {
        JsonSerializer serializer = new JsonSerializer();
        serializer.Serialize(writer, result);
    }

    Console.ReadLine();
}