/// <summary>
/// Collects the image items referenced by the given features and writes them into the zip stream
/// as a sequence of JSON files.
/// </summary>
/// <remarks>
/// Every attribute whose name starts with <c>FeatureAttributes.IMAGE_URL</c> is treated as an image URL.
/// URLs that are not part of the repository's URL listing are logged and skipped; the remaining ones are
/// loaded from the repository in parallel. The collected items are serialized with camel-case property
/// names into entries named <c>images/imagesNNN.json</c>, holding at most 1000 items each.
/// The same URL referenced by several features is written once per reference.
/// </remarks>
/// <param name="features">Features whose image URL attributes are scanned.</param>
/// <param name="zipStream">Zip stream that receives the JSON entries; it is left open.</param>
private void CreateImagesJsonFiles(List<Feature> features, ZipOutputStream zipStream)
{
    const int itemsPerFile = 1000;

    var items = new ConcurrentBag<ImageItem>();
    // A set makes the per-URL membership test below O(1); it is only read inside the parallel loop.
    var downloadedUrls = _imagesRepository.GetAllUrls().Result.ToHashSet();
    _logger.LogInformation($"Staring Image file creation: {features.Count} features, exiting images: {downloadedUrls.Count}");

    Parallel.ForEach(features, new ParallelOptions { MaxDegreeOfParallelism = 10 }, feature =>
    {
        var urls = feature.Attributes.GetNames()
            .Where(n => n.StartsWith(FeatureAttributes.IMAGE_URL))
            .Select(n => feature.Attributes[n].ToString())
            .Where(u => !string.IsNullOrWhiteSpace(u));
        foreach (var url in urls)
        {
            if (!downloadedUrls.Contains(url))
            {
                _logger.LogWarning("The following image does not exist in database: " + url + " feature: " + feature.GetId());
                continue;
            }
            items.Add(_imagesRepository.GetImageByUrl(url).Result);
        }
    });

    var list = items.ToList();
    // Loop-invariant objects are created once instead of once per output file.
    var serializerSettings = new JsonSerializerSettings
    {
        ContractResolver = new CamelCasePropertyNamesContractResolver()
    };
    var copyBuffer = new byte[4096];

    // Walk the list by offset rather than re-materializing the remaining tail for every page,
    // which copied the rest of the list again on each iteration.
    for (int offset = 0, index = 0; offset < list.Count; offset += itemsPerFile, index++)
    {
        var page = list.GetRange(offset, Math.Min(itemsPerFile, list.Count - offset));
        var imageItemsString = JsonConvert.SerializeObject(page, serializerSettings);
        var newEntry = new ZipEntry($"images/images{index:000}.json")
        {
            DateTime = DateTime.Now
        };
        zipStream.PutNextEntry(newEntry);
        using (var jsonStream = new MemoryStream(Encoding.UTF8.GetBytes(imageItemsString)))
        {
            StreamUtils.Copy(jsonStream, zipStream, copyBuffer);
        }
        zipStream.CloseEntry();
    }

    _logger.LogInformation("Finished Image file creation: " + list.Count);
}
/// <inheritdoc/>
/// <remarks>
/// Works in two phases. First, every URL returned by the images repository that is not in
/// <paramref name="imagesUrls"/> is deleted from the repository. Then the requested URLs are processed
/// with up to 20 parallel workers: a URL that is already stored and whose remote size is greater than
/// zero is skipped; otherwise the content is fetched (up to three attempts, 200 ms apart) and stored,
/// or the repository entry is deleted when no content could be obtained. A failure for a single URL is
/// logged as a warning and does not stop the remaining URLs.
/// </remarks>
public async Task DownloadAndStoreUrls(List<string> imagesUrls)
{
    // Materialized as a set once: it is probed for every URL inside the parallel loop below.
    var exitingUrls = (await _imagesRepository.GetAllUrls()).ToHashSet();
    var needToRemove = exitingUrls.Except(imagesUrls).ToList();
    _logger.LogInformation($"Need to remove {needToRemove.Count} images that are no longer relevant");
    foreach (var imageUrlToRemove in needToRemove)
    {
        await _imagesRepository.DeleteImageByUrl(imageUrlToRemove);
    }
    _logger.LogInformation($"Finished removing images, starting downloading and index: {imagesUrls.Count}");

    var counter = 0;
    Parallel.ForEach(
        imagesUrls,
        new ParallelOptions { MaxDegreeOfParallelism = 20 },
        // Hash algorithm instances are not safe for concurrent use, so every worker gets its own
        // MD5 instead of all workers sharing a single one.
        () => MD5.Create(),
        (imageUrl, loopState, md5) =>
        {
            try
            {
                // Use the value returned by the atomic increment; re-reading the shared counter
                // could observe another worker's increment and skip or repeat a progress line.
                var processed = Interlocked.Increment(ref counter);
                if (processed % 100 == 0)
                {
                    _logger.LogInformation($"Indexed {processed} images of {imagesUrls.Count}");
                }

                if (exitingUrls.Contains(imageUrl))
                {
                    var size = _remoteFileFetcherGateway.GetFileSize(imageUrl).Result;
                    if (size > 0)
                    {
                        // Already stored and the remote file reports a positive size - nothing to do.
                        return md5;
                    }
                }

                var content = new byte[0];
                for (int retryIndex = 0; retryIndex < 3; retryIndex++)
                {
                    try
                    {
                        content = _remoteFileFetcherGateway.GetFileContent(imageUrl).Result.Content;
                        break;
                    }
                    catch
                    {
                        // Best effort: a failed attempt is retried after a short pause; when all
                        // attempts fail the content stays empty and is handled below.
                        Task.Delay(200).Wait();
                    }
                }

                if (content.Length == 0)
                {
                    // Nothing could be fetched, so make sure no entry is kept for this URL.
                    _imagesRepository.DeleteImageByUrl(imageUrl).Wait();
                    return md5;
                }

                StoreImage(md5, content, imageUrl).Wait();
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "There was a problem with the following image url: " + imageUrl + " ");
            }
            return md5;
        },
        // Runs once per worker after its last item.
        md5 => md5.Dispose());
}