/// <summary>
/// Downloads the live en-us documents of active depots under
/// "docs.microsoft.com/dotnet/" and builds one Bloom filter of asset ids per depot.
/// The resulting <c>DepotBloomFilter</c> list is serialized to BSON in memory,
/// then the method waits for a line on standard input.
/// </summary>
/// <param name="clientName">Client name passed to the document hosting service client.</param>
/// <param name="accessKey">Access key passed to the document hosting service client.</param>
/// <exception cref="Exception">
/// The last service failure is rethrown when a page cannot be fetched after three attempts.
/// </exception>
public static void GenerateDotnetApiBloomFilter(string clientName, string accessKey)
{
    const int MaxAttempts = 3;
    double falsePositiveRate = 0.00001;

    // Alternative (sandbox) endpoint: "https://op-dhs-sandbox-read.azurewebsites.net/"
    string baseUri = "https://op-dhs-prod-read-nus.azurewebsites.net/";
    IDocumentHostingService dhsClient = new DocumentHostingServiceClient(new Uri(baseUri), clientName, accessKey);

    // Fetches a single page of documents, retrying up to MaxAttempts times.
    // The previous inline loop incremented the page counter instead of the retry
    // counter, so a persistent failure retried forever.
    GetDocumentsResponse FetchPage(string depotName, string continueAt)
    {
        for (int attempt = 1; ; attempt++)
        {
            try
            {
                return dhsClient.GetDocumentsPaginated(depotName, "en-us", "live", true, continueAt, null, null, CancellationToken.None).Result;
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
                if (attempt >= MaxAttempts)
                {
                    // Give up: continuing would re-request the same page indefinitely.
                    throw;
                }

                Console.WriteLine($"Retry for {attempt} times");
            }
        }
    }

    IList<DepotBloomFilter> result = new List<DepotBloomFilter>();
    IList<GetDepotResponse> depots = dhsClient.GetAllDepotsBySiteBasePath("docs", "docs.microsoft.com/dotnet/", null, CancellationToken.None).Result;

    // NOTE(review): only the 21st depot is processed here; this looks like a
    // debugging leftover - confirm before relying on the output.
    foreach (GetDepotResponse depot in depots.Skip(20).Take(1))
    {
        if (!depot.SystemMetadata.GetValueOrDefault<bool>(MetadataConstants.ActiveKey))
        {
            continue;
        }

        string depotName = depot.DepotName;
        Console.WriteLine($"{depotName} Start.");

        // Materialized list: the count is needed several times below, and a chain of
        // deferred Concat calls would be re-enumerated on every Count().
        var allDocuments = new List<GetDocumentResponse>();
        string continueAt = null;
        int pageNumber = 0;
        do
        {
            GetDocumentsResponse documents = FetchPage(depotName, continueAt);
            continueAt = documents.ContinueAt;
            allDocuments.AddRange(documents.Documents);

            pageNumber++;
            Console.WriteLine($"{pageNumber:000} ..................");
        }
        while (!string.IsNullOrEmpty(continueAt));

        Console.WriteLine($"{depotName} Size: {allDocuments.Count}.");

        var bloomFilter = new BloomFilter(allDocuments.Count, falsePositiveRate);
        foreach (var document in allDocuments)
        {
            bloomFilter.Add(document.AssetId);
        }

        Console.WriteLine($"{depotName} Bloom Filter Size: {bloomFilter.BitLength / 1024 / 8} KB.");
        result.Add(new DepotBloomFilter
        {
            DepotName = depotName,
            BloomFilter = bloomFilter.BitArray,
            Count = allDocuments.Count,
            FalsePositiveRate = falsePositiveRate
        });
        Console.WriteLine($"{depotName} Done.");
    }

    // NOTE(review): the BSON bytes are written to an in-memory stream that is never
    // persisted or returned; an earlier JSON file output ("output.json") was disabled.
    using (MemoryStream ms = new MemoryStream())
    using (BsonWriter writer = new BsonWriter(ms))
    {
        JsonSerializer serializer = new JsonSerializer();
        serializer.Serialize(writer, result);
    }

    Console.ReadLine();
}
/// <summary>
/// Builds one Bloom filter of asset ids per active depot under
/// "docs.microsoft.com/dotnet/", then checks every collected asset id against all
/// filters to count how many filters report it as present.
/// The filters are appended to "output.json" and the histogram of match counts
/// (for ids matched by a number of filters other than one) is appended to
/// "conflictCount.txt". The method then waits for a line on standard input.
/// </summary>
/// <param name="clientName">Client name passed to the document hosting service client.</param>
/// <param name="accessKey">Access key passed to the document hosting service client.</param>
/// <exception cref="Exception">
/// The last service failure is rethrown when a page cannot be fetched after three attempts.
/// </exception>
public static void TestDotnerApiBloomFilters(string clientName, string accessKey)
{
    const int MaxAttempts = 3;
    double falsePositiveRate = 0.00001;

    // Alternative (sandbox) endpoint: "https://op-dhs-sandbox-read.azurewebsites.net/"
    string baseUri = "https://op-dhs-prod-read-nus.azurewebsites.net/";
    IDocumentHostingService dhsClient = new DocumentHostingServiceClient(new Uri(baseUri), clientName, accessKey);

    // Fetches a single page of documents, retrying up to MaxAttempts times.
    // The previous inline loop incremented the page counter instead of the retry
    // counter, so a persistent failure retried forever.
    GetDocumentsResponse FetchPage(string depotName, string continueAt)
    {
        for (int attempt = 1; ; attempt++)
        {
            try
            {
                return dhsClient.GetDocumentsPaginated(depotName, "en-us", "live", true, continueAt, null, null, CancellationToken.None).Result;
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
                if (attempt >= MaxAttempts)
                {
                    // Give up: continuing would re-request the same page indefinitely.
                    throw;
                }

                Console.WriteLine($"Retry for {attempt} times");
            }
        }
    }

    IList<BloomFilter> result = new List<BloomFilter>();
    IList<string> assetIds = new List<string>();
    IList<GetDepotResponse> depots = dhsClient.GetAllDepotsBySiteBasePath("docs", "docs.microsoft.com/dotnet/", null, CancellationToken.None).Result;

    foreach (GetDepotResponse depot in depots)
    {
        if (!depot.SystemMetadata.GetValueOrDefault<bool>(MetadataConstants.ActiveKey))
        {
            continue;
        }

        string depotName = depot.DepotName;
        Console.WriteLine($"{depotName} Start.");

        // Materialized list: the count is needed more than once, and a chain of
        // deferred Concat calls would be re-enumerated on every Count().
        var allDocuments = new List<GetDocumentResponse>();
        string continueAt = null;
        int pageNumber = 0;
        do
        {
            GetDocumentsResponse documents = FetchPage(depotName, continueAt);
            continueAt = documents.ContinueAt;
            allDocuments.AddRange(documents.Documents);

            pageNumber++;
            Console.WriteLine($"{pageNumber:000} ..................");
        }
        while (!string.IsNullOrEmpty(continueAt));

        Console.WriteLine($"{depotName} Size: {allDocuments.Count}.");

        var bloomFilter = new BloomFilter(allDocuments.Count, falsePositiveRate);
        foreach (var document in allDocuments)
        {
            bloomFilter.Add(document.AssetId);
            assetIds.Add(document.AssetId);
        }

        Console.WriteLine($"{depotName} Bloom Filter Size: {bloomFilter.BitLength / 1024 / 8} KB.");
        result.Add(bloomFilter);
        Console.WriteLine($"{depotName} Done.");
    }

    // Appends (does not overwrite) so repeated runs accumulate in the same file.
    using (StreamWriter file = new StreamWriter(@"output.json", true))
    {
        file.WriteLine(JsonConvert.SerializeObject(result));
    }

    // Histogram: number of filters matching an id -> how many ids had that match count.
    // Ids matched by exactly one filter are tallied separately.
    int onlyOneCount = 0;
    var conflictHashDict = new Dictionary<int, int>();
    foreach (var assetId in assetIds)
    {
        int conflictCount = result.Count(r => r.Contains(assetId));
        if (conflictCount == 1)
        {
            onlyOneCount++;
        }
        else
        {
            // A missing key leaves 'seen' at 0, so this covers both insert and increment.
            conflictHashDict.TryGetValue(conflictCount, out int seen);
            conflictHashDict[conflictCount] = seen + 1;
        }
    }

    Console.WriteLine($"Only one count: {onlyOneCount}, total count: {assetIds.Count}");
    Console.WriteLine($"Duplicate count: {conflictHashDict.Count}");

    // Most frequent match counts first.
    var output = conflictHashDict.OrderByDescending(p => p.Value);
    using (StreamWriter file = new StreamWriter(@"conflictCount.txt", true))
    {
        foreach (var item in output)
        {
            file.WriteLine($"{item.Key}: {item.Value}");
        }
    }

    Console.ReadLine();
}
/// <summary>
/// Copies the documents of the first <paramref name="top"/> depots (ordered by
/// priority) under <paramref name="basePath"/>: each document's content is
/// downloaded and uploaded as a page, and a <c>Document</c> record per page is
/// then uploaded in bulk, all tagged with one freshly generated active etag.
/// </summary>
/// <param name="basePath">Site base path used to look up depots and to prefix document URLs.</param>
/// <param name="branch">Branch whose documents are requested and recorded.</param>
/// <param name="locale">Locale whose documents are requested and recorded.</param>
/// <param name="top">Number of depots to take after ordering by priority.</param>
public static async Task Migrate(string basePath, string branch, string locale, int top = 3)
{
    // Formats done/total as a whole-number percentage string, clamped to at most 100.
    string ToPercent(double done, double total)
    {
        var ratio = Math.Min(1.0, done / Math.Max(1.0, total));
        return ((int)(100 * ratio)).ToString();
    }

    Console.WriteLine($"Get depots for {basePath}");
    var allDepots = await s_dhsClient.GetAllDepotsBySiteBasePath("Docs", basePath, null, CancellationToken.None);
    var selectedDepots = allDepots.OrderBy(d => d.Priority).Take(top);

    // A single etag is shared by every document written during this run.
    var activeEtag = Guid.NewGuid().ToString();

    await ParallelUtility.ParallelForEach(
        selectedDepots,
        async depot =>
        {
            var depotName = depot.DepotName;

            // Page through the depot's documents until no continuation token is returned.
            var continuation = string.Empty;
            var collected = new List<GetDocumentResponse>();
            var pageIndex = 1;
            while (true)
            {
                Console.WriteLine($"Load {1000 * pageIndex} documents for {depotName}");
                pageIndex++;

                var page = await s_dhsClient.GetDocumentsPaginated(depotName, locale, branch, false, continuation, null, 1000, CancellationToken.None);
                collected.AddRange(page.Documents);
                continuation = page.ContinueAt;

                if (string.IsNullOrEmpty(continuation))
                {
                    break;
                }
            }

            Console.WriteLine($"Convert {collected.Count} documents for {depotName}");

            // Filled from the concurrent per-document callbacks below.
            var pageDocs = new ConcurrentBag<Document>();

            await ParallelUtility.ParallelForEach(
                collected,
                async source =>
                {
                    using (Stream contentStream = await HttpHelper.DownloadAsStream(source.ContentUri, CancellationToken.None))
                    {
                        var metadata = source.CombinedMetadata;

                        var (pageUrl, pageHash) = await Writer.UploadPage(
                            contentStream,
                            metadata.GetValueOrDefault<bool>("is_dynamic_rendering"),
                            metadata.GetValueOrDefault<string>("content_type"));

                        pageDocs.Add(new Document
                        {
                            Docset = depotName,
                            Url = $"{basePath}{source.AssetId}",
                            Locale = locale,
                            Branch = branch,
                            Monikers = metadata.GetValueOrDefault<JArray>("monikers")?.ToObject<List<string>>(),
                            ActiveEtag = activeEtag,
                            PageHash = pageHash,
                            PageUrl = pageUrl,
                            PageType = metadata.GetValueOrDefault<string>("page_type"),
                            Title = metadata.GetValueOrDefault<string>("title"),
                            Layout = metadata.GetValueOrDefault<string>("layout"),
                            IsDynamicRendering = metadata.GetValueOrDefault<bool>("is_dynamic_rendering"),
                            ContentType = metadata.GetValueOrDefault<string>("content_type")
                        });
                    }
                },
                400,
                200,
                (done, total) =>
                {
                    var percent = ToPercent(done, total);
                    Console.WriteLine($"Uploading Page Content for {depotName}: {percent.PadLeft(3)}% {done}/{total}");
                });

            await Writer.UploadDocuments(
                pageDocs.ToList(),
                activeEtag,
                (done, total) =>
                {
                    var percent = ToPercent(done, total);
                    Console.WriteLine($"Uploading Page Document for {depotName}: {percent.PadLeft(3)}% {done}/{total}");
                });
        },
        10,
        5);
}