/// <summary>
/// Queues a background job that imports a manuscript from the Cambridge Digital Library
/// (http://cudl.lib.cam.ac.uk) and stores it as a draft artifact.
/// </summary>
/// <remarks>
/// An import job row is saved before the work item is queued. The work item downloads
/// <c>http://cudl.lib.cam.ac.uk/view/{resourceNumber}.json</c>, turns its displayable descriptive
/// metadata into tags and then recovers (from files already on disk) or downloads one image per
/// entry of the <c>pages</c> array. Any failure inside the work item is recorded on the job
/// (status, end time and exception text) instead of being thrown to the caller.
/// </remarks>
/// <param name="resourceNumber">CUDL resource identifier, e.g. MS-RAS-00258</param>
/// <param name="friendlyUrl">friendly url of the new artifact; <paramref name="resourceNumber"/> is used when it is null or empty</param>
/// <returns>
/// true when the job was queued; false with an error message when a non-failed/non-aborted job already
/// exists for the resource, when the friendly url is already taken, or when queuing throws
/// </returns>
private async Task<RServiceResult<bool>> StartImportingFromCambridge(string resourceNumber, string friendlyUrl)
{
    string url = $"http://cudl.lib.cam.ac.uk/view/{resourceNumber}.json";

    // AnyAsync instead of SingleOrDefaultAsync: only existence matters and it cannot throw on multiple matches
    bool jobExists = await _context.ImportJobs
        .Where(j => j.JobType == JobType.Cambridge && j.ResourceNumber == resourceNumber && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted))
        .AnyAsync();
    if (jobExists)
    {
        return new RServiceResult<bool>(false, $"Job is already scheduled or running for importing {url}");
    }

    if (string.IsNullOrEmpty(friendlyUrl))
    {
        friendlyUrl = resourceNumber;
    }

    if (await _context.Artifacts.Where(a => a.FriendlyUrl == friendlyUrl).AnyAsync())
    {
        return new RServiceResult<bool>(false, $"duplicated artifact friendly url '{friendlyUrl}'");
    }

    ImportJob job = new ImportJob()
    {
        JobType = JobType.Cambridge,
        ResourceNumber = resourceNumber,
        FriendlyUrl = friendlyUrl,
        SrcUrl = url,
        QueueTime = DateTime.Now,
        ProgressPercent = 0,
        Status = ImportJobStatus.NotStarted
    };

    await _context.ImportJobs.AddAsync(job);
    await _context.SaveChangesAsync();

    try
    {
        _backgroundTaskQueue.QueueBackgroundWorkItem
            (
            async token =>
            {
                // Records the job as failed using a short-lived context of its own,
                // so the update does not depend on the state of the import context.
                Func<string, Task> failJobAsync = async message =>
                {
                    using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration))
                    {
                        job.EndTime = DateTime.Now;
                        job.Status = ImportJobStatus.Failed;
                        job.Exception = message;
                        importJobUpdaterDb.Update(job);
                        await importJobUpdaterDb.SaveChangesAsync();
                    }
                };

                // 503 responses are retried with a linearly growing delay
                const int importRetryCount = 200;
                const int importRetryInitialSleep = 500;

                try
                {
                    using (var client = new HttpClient())
                    using (var result = await client.GetAsync(url))
                    {
                        if (!result.IsSuccessStatusCode)
                        {
                            await failJobAsync($"Http result is not ok ({result.StatusCode}) for {url}");
                            return;
                        }

                        using (RMuseumDbContext context = new RMuseumDbContext(Configuration))
                        {
                            // the friendly url may have been taken between queuing and execution
                            if (await context.Artifacts.Where(a => a.FriendlyUrl == job.FriendlyUrl).AnyAsync())
                            {
                                await failJobAsync("aborted because of duplicated friendly url");
                                return;
                            }

                            RArtifactMasterRecord book = new RArtifactMasterRecord($"extracted from {url}", $"extracted from {url}")
                            {
                                Status = PublishStatus.Draft,
                                DateTime = DateTime.Now,
                                LastModified = DateTime.Now,
                                CoverItemIndex = 0,
                                FriendlyUrl = friendlyUrl
                            };

                            List<RTagValue> meta = new List<RTagValue>();
                            RTagValue tag;

                            string json = await result.Content.ReadAsStringAsync();
                            var parsed = JObject.Parse(json);

                            book.Name = book.NameInEnglish =
                                parsed.SelectToken("logicalStructures[*].label").Value<string>();

                            // markup is stripped from the abstract
                            book.Description = book.DescriptionInEnglish =
                                Regex.Replace(
                                    parsed.SelectToken("descriptiveMetadata[*].abstract.displayForm").Value<string>(),
                                    "<.*?>", string.Empty);

                            // every metadata entry flagged display == "True" becomes one tag per displayForm
                            int tagOrder = 1;
                            foreach (JToken descriptiveMetadata in parsed.SelectTokens("$.descriptiveMetadata[*]").Children())
                            {
                                foreach (JToken child in descriptiveMetadata.Children())
                                {
                                    JToken labelToken = child.SelectToken("label");
                                    JToken displayToken = child.SelectToken("display");
                                    if (labelToken == null || displayToken == null)
                                    {
                                        continue;
                                    }
                                    if (displayToken.Value<string>() != "True")
                                    {
                                        continue;
                                    }

                                    string metaName = labelToken.Value<string>();
                                    JToken displayFormToken = child.SelectToken("displayForm");
                                    if (displayFormToken != null)
                                    {
                                        string metaValue = Regex.Replace(displayFormToken.Value<string>(), "<.*?>", string.Empty);
                                        tag = await TagHandler.PrepareAttribute(context, metaName, metaValue, tagOrder++);
                                        meta.Add(tag);
                                    }
                                    else if (child.SelectToken("value") != null)
                                    {
                                        foreach (JToken value in child.SelectTokens("value").Children())
                                        {
                                            JToken valueDisplayForm = value.SelectToken("displayForm");
                                            if (valueDisplayForm != null)
                                            {
                                                string metaValue = Regex.Replace(valueDisplayForm.Value<string>(), "<.*?>", string.Empty);
                                                tag = await TagHandler.PrepareAttribute(context, metaName, metaValue, tagOrder++);
                                                meta.Add(tag);
                                            }
                                        }
                                    }
                                }
                            }

                            string imageReproPageURL = "https://image01.cudl.lib.cam.ac.uk";

                            using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration))
                            {
                                job.StartTime = DateTime.Now;
                                job.Status = ImportJobStatus.Running;
                                job.SrcContent = json;
                                importJobUpdaterDb.Update(job);
                                await importJobUpdaterDb.SaveChangesAsync();
                            }

                            tag = await TagHandler.PrepareAttribute(context, "Type", "Book", 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Source", "University of Cambridge Digital Library", 1);
                            string viewerUrl = $"http://cudl.lib.cam.ac.uk/view/{resourceNumber}";
                            tag.ValueSupplement = viewerUrl;
                            meta.Add(tag);

                            book.Tags = meta.ToArray();

                            List<RArtifactItemRecord> pages = new List<RArtifactItemRecord>();
                            string artifactFolder = Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl);
                            int order = 0;
                            foreach (JToken pageToken in parsed.SelectTokens("$.pages").Children())
                            {
                                order++;

                                // NOTE(review): this stores the page ordinal, not a real percentage - confirm intent
                                using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration))
                                {
                                    job.ProgressPercent = order;
                                    importJobUpdaterDb.Update(job);
                                    await importJobUpdaterDb.SaveChangesAsync();
                                }

                                string imageUrl = imageReproPageURL + pageToken.SelectToken("downloadImageURL").Value<string>();
                                string paddedOrder = $"{order}".PadLeft(4, '0');
                                string imageFileName = paddedOrder + ".jpg";

                                RArtifactItemRecord page = new RArtifactItemRecord()
                                {
                                    Name = $"تصویر {order}",
                                    NameInEnglish = $"Image {order}",
                                    Description = "",
                                    DescriptionInEnglish = "",
                                    Order = order,
                                    FriendlyUrl = $"p{paddedOrder}",
                                    LastModified = DateTime.Now
                                };

                                List<RTagValue> pageMeta = new List<RTagValue>();
                                tag = await TagHandler.PrepareAttribute(context, "Source", "University of Cambridge Digital Library", 1);
                                tag.ValueSupplement = $"{viewerUrl}/{order}";
                                pageMeta.Add(tag);

                                JToken pageLabel = pageToken.SelectToken("label");
                                if (pageLabel != null)
                                {
                                    tag = await TagHandler.PrepareAttribute(context, "Label", pageLabel.Value<string>(), 1);
                                    pageMeta.Add(tag);
                                }

                                page.Tags = pageMeta.ToArray();

                                if (string.IsNullOrEmpty(imageUrl))
                                {
                                    continue;
                                }

                                string origPath = Path.Combine(artifactFolder, "orig", imageFileName);
                                string normPath = Path.Combine(artifactFolder, "norm", imageFileName);
                                string thumbPath = Path.Combine(artifactFolder, "thumb", imageFileName);

                                // reuse the files of a previous (interrupted) import when all three variants exist
                                if (File.Exists(origPath) && File.Exists(normPath) && File.Exists(thumbPath))
                                {
                                    RServiceResult<RPictureFile> recoveredPicture =
                                        await _pictureFileService.RecoverFromeFiles(page.Name, page.Description, 1,
                                            imageUrl, origPath, normPath, thumbPath, imageFileName, friendlyUrl);
                                    if (recoveredPicture.Result != null)
                                    {
                                        page.Images = new RPictureFile[] { recoveredPicture.Result };
                                        page.CoverImageIndex = 0;

                                        if (book.CoverItemIndex == (order - 1))
                                        {
                                            book.CoverImage = RPictureFile.Duplicate(recoveredPicture.Result);
                                        }

                                        pages.Add(page);
                                        continue;
                                    }
                                }

                                // remove partial leftovers before downloading the image again
                                foreach (string stalePath in new[] { origPath, normPath, thumbPath })
                                {
                                    if (File.Exists(stalePath))
                                    {
                                        File.Delete(stalePath);
                                    }
                                }

                                var imageResult = await client.GetAsync(imageUrl);
                                try
                                {
                                    if (imageResult.StatusCode == HttpStatusCode.NotFound)
                                    {
                                        break; //finished
                                    }

                                    int retryCount = 0;
                                    while (retryCount < importRetryCount && !imageResult.IsSuccessStatusCode && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable)
                                    {
                                        imageResult.Dispose();
                                        // Task.Delay keeps the thread free while waiting (Thread.Sleep would block it)
                                        await Task.Delay(importRetryInitialSleep * (retryCount + 1));
                                        imageResult = await client.GetAsync(imageUrl);
                                        retryCount++;
                                    }

                                    if (!imageResult.IsSuccessStatusCode)
                                    {
                                        await failJobAsync($"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}");
                                        return;
                                    }

                                    using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync())
                                    {
                                        RServiceResult<RPictureFile> picture =
                                            await _pictureFileService.Add(page.Name, page.Description, 1, null, imageUrl, imageStream, imageFileName, friendlyUrl);
                                        if (picture.Result == null)
                                        {
                                            throw new Exception($"_pictureFileService.Add : {picture.ExceptionString}");
                                        }

                                        page.Images = new RPictureFile[] { picture.Result };
                                        page.CoverImageIndex = 0;

                                        if (book.CoverItemIndex == (order - 1))
                                        {
                                            book.CoverImage = RPictureFile.Duplicate(picture.Result);
                                        }

                                        pages.Add(page);
                                    }
                                }
                                finally
                                {
                                    // the response is released on every exit path (break, return, exception)
                                    imageResult.Dispose();
                                }

                                // kept from the original implementation, which forces a collection after each downloaded image
                                GC.Collect();
                            }

                            book.Items = pages.ToArray();
                            book.ItemCount = pages.Count;

                            if (pages.Count == 0)
                            {
                                await failJobAsync("Pages.Count == 0");
                                return;
                            }

                            await context.Artifacts.AddAsync(book);
                            await context.SaveChangesAsync();

                            job.ProgressPercent = 100;
                            job.Status = ImportJobStatus.Succeeded;
                            job.ArtifactId = book.Id;
                            job.EndTime = DateTime.Now;
                            context.Update(job);
                            await context.SaveChangesAsync();
                        }
                    }
                }
                catch (Exception exp)
                {
                    await failJobAsync(exp.ToString());
                }
            }
            );

        return new RServiceResult<bool>(true);
    }
    catch (Exception exp)
    {
        return new RServiceResult<bool>(false, exp.ToString());
    }
}
/// <summary>
/// Queues a background job that imports every <c>*.jpg</c> file of a folder on the server
/// as the pages of a new draft artifact.
/// </summary>
/// <remarks>
/// An import job row is saved before the work item is queued. The work item creates one page per
/// image file, numbering pages in file name order, and records any failure on the job
/// (status, end time and exception text).
/// </remarks>
/// <param name="folderPath">C:\Tools\batches\florence</param>
/// <param name="friendlyUrl">shahname-florence</param>
/// <param name="srcUrl">https://t.me/dr_khatibi_abolfazl/888</param>
/// <returns>
/// true when the job was queued; false with an error message when a non-failed/non-aborted job already
/// exists for the folder, when the friendly url is empty or already taken, or when an exception occurs
/// </returns>
private async Task<RServiceResult<bool>> StartImportingFromServerFolder(string folderPath, string friendlyUrl, string srcUrl)
{
    try
    {
        // AnyAsync instead of SingleOrDefaultAsync: only existence matters and it cannot throw on multiple matches
        bool jobExists = await _context.ImportJobs
            .Where(j => j.JobType == JobType.ServerFolder && j.ResourceNumber == folderPath && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted))
            .AnyAsync();
        if (jobExists)
        {
            return new RServiceResult<bool>(false, $"Job is already scheduled or running for importing server folder {folderPath}");
        }

        if (string.IsNullOrEmpty(friendlyUrl))
        {
            return new RServiceResult<bool>(false, $"Friendly url is empty, server folder {folderPath}");
        }

        if (await _context.Artifacts.Where(a => a.FriendlyUrl == friendlyUrl).AnyAsync())
        {
            return new RServiceResult<bool>(false, $"duplicated friendly url '{friendlyUrl}'");
        }

        ImportJob job = new ImportJob()
        {
            JobType = JobType.ServerFolder,
            ResourceNumber = folderPath,
            FriendlyUrl = friendlyUrl,
            SrcUrl = srcUrl,
            QueueTime = DateTime.Now,
            ProgressPercent = 0,
            Status = ImportJobStatus.NotStarted
        };

        await _context.ImportJobs.AddAsync(job);
        await _context.SaveChangesAsync();

        _backgroundTaskQueue.QueueBackgroundWorkItem
            (
            async token =>
            {
                // Records the job as failed using a short-lived context of its own.
                Func<string, Task> failJobAsync = async message =>
                {
                    using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration))
                    {
                        job.EndTime = DateTime.Now;
                        job.Status = ImportJobStatus.Failed;
                        job.Exception = message;
                        importJobUpdaterDb.Update(job);
                        await importJobUpdaterDb.SaveChangesAsync();
                    }
                };

                try
                {
                    using (RMuseumDbContext context = new RMuseumDbContext(Configuration))
                    {
                        RArtifactMasterRecord book = new RArtifactMasterRecord($"extracted from server folder {job.ResourceNumber}", $"extracted from server folder {job.ResourceNumber}")
                        {
                            Status = PublishStatus.Draft,
                            DateTime = DateTime.Now,
                            LastModified = DateTime.Now,
                            CoverItemIndex = 0,
                            FriendlyUrl = friendlyUrl
                        };

                        List<RTagValue> meta = new List<RTagValue>();
                        RTagValue tag;

                        tag = await TagHandler.PrepareAttribute(context, "Type", "Book", 1);
                        meta.Add(tag);

                        // added exactly once (the original appended this tag to the list twice)
                        tag = await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1);
                        meta.Add(tag);

                        using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration))
                        {
                            job.StartTime = DateTime.Now;
                            job.Status = ImportJobStatus.Running;
                            job.SrcContent = "";
                            importJobUpdaterDb.Update(job);
                            await importJobUpdaterDb.SaveChangesAsync();
                        }

                        List<RArtifactItemRecord> pages = new List<RArtifactItemRecord>();

                        // Directory.GetFiles does not guarantee any order; sort so page numbers follow file names
                        string[] fileNames = Directory.GetFiles(job.ResourceNumber, "*.jpg");
                        Array.Sort(fileNames, StringComparer.OrdinalIgnoreCase);

                        string artifactFolder = Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl);
                        int order = 0;
                        foreach (string fileName in fileNames)
                        {
                            // progress is reported before the page is processed, so it starts at 0
                            using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration))
                            {
                                job.ProgressPercent = order * 100 / (decimal)fileNames.Length;
                                importJobUpdaterDb.Update(job);
                                await importJobUpdaterDb.SaveChangesAsync();
                            }

                            order++;

                            string paddedOrder = $"{order}".PadLeft(4, '0');
                            string imageFileName = paddedOrder + ".jpg";

                            RArtifactItemRecord page = new RArtifactItemRecord()
                            {
                                Name = $"تصویر {order}",
                                NameInEnglish = $"Image {order} of {book.NameInEnglish}",
                                Description = "",
                                DescriptionInEnglish = "",
                                Order = order,
                                FriendlyUrl = $"p{paddedOrder}",
                                LastModified = DateTime.Now
                            };

                            page.Tags = new RTagValue[] { };

                            // remove files left behind by a previous import of the same page
                            foreach (string variant in new[] { "orig", "norm", "thumb" })
                            {
                                string stalePath = Path.Combine(artifactFolder, variant, imageFileName);
                                if (File.Exists(stalePath))
                                {
                                    File.Delete(stalePath);
                                }
                            }

                            // the source image is only read, so do not request write access to it
                            using (FileStream imageStream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
                            {
                                RServiceResult<RPictureFile> picture =
                                    await _pictureFileService.Add(page.Name, page.Description, 1, null, job.SrcUrl, imageStream, imageFileName, friendlyUrl);
                                if (picture.Result == null)
                                {
                                    throw new Exception($"_pictureFileService.Add : {picture.ExceptionString}");
                                }

                                page.Images = new RPictureFile[] { picture.Result };
                                page.CoverImageIndex = 0;

                                if (book.CoverItemIndex == (order - 1))
                                {
                                    book.CoverImage = RPictureFile.Duplicate(picture.Result);
                                }
                            }

                            pages.Add(page);
                        }

                        book.Tags = meta.ToArray();
                        book.Items = pages.ToArray();
                        book.ItemCount = pages.Count;

                        if (pages.Count == 0)
                        {
                            await failJobAsync("Pages.Count == 0");
                            return;
                        }

                        await context.Artifacts.AddAsync(book);
                        await context.SaveChangesAsync();

                        job.ProgressPercent = 100;
                        job.Status = ImportJobStatus.Succeeded;
                        job.ArtifactId = book.Id;
                        job.EndTime = DateTime.Now;
                        context.Update(job);
                        await context.SaveChangesAsync();
                    }
                }
                catch (Exception exp)
                {
                    await failJobAsync(exp.ToString());
                }
            }
            );

        return new RServiceResult<bool>(true);
    }
    catch (Exception exp)
    {
        return new RServiceResult<bool>(false, exp.ToString());
    }
}
/// <summary>
/// Queues a background job that imports a manuscript from the Digital Walters
/// (http://www.thedigitalwalters.org/01_ACCESS_WALTERS_MANUSCRIPTS.html) and stores it as a draft artifact.
/// </summary>
/// <remarks>
/// An import job row is saved before the work item is queued. The work item downloads the TEI
/// description of the manuscript, reads title, author, responsibility and note elements as tags and
/// then recovers (from files already on disk) or downloads every facsimile graphic whose url contains
/// <c>sap.jpg</c>. Any failure inside the work item is recorded on the job.
/// </remarks>
/// <param name="resourceNumber">W619</param>
/// <param name="friendlyUrl">golestan-walters-01; <paramref name="resourceNumber"/> is used when it is null or empty</param>
/// <returns>
/// true when the job was queued; false with an error message when a non-failed/non-aborted job already
/// exists for the resource, when the friendly url is already taken, or when queuing throws
/// </returns>
private async Task<RServiceResult<bool>> StartImportingFromWalters(string resourceNumber, string friendlyUrl)
{
    string url = $"http://www.thedigitalwalters.org/Data/WaltersManuscripts/ManuscriptDescriptions/{resourceNumber}_tei.xml";

    // AnyAsync instead of SingleOrDefaultAsync: only existence matters and it cannot throw on multiple matches
    bool jobExists = await _context.ImportJobs
        .Where(j => j.JobType == JobType.Walters && j.ResourceNumber == resourceNumber && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted))
        .AnyAsync();
    if (jobExists)
    {
        return new RServiceResult<bool>(false, $"Job is already scheduled or running for importing {url}");
    }

    if (string.IsNullOrEmpty(friendlyUrl))
    {
        friendlyUrl = resourceNumber;
    }

    if (await _context.Artifacts.Where(a => a.FriendlyUrl == friendlyUrl).AnyAsync())
    {
        return new RServiceResult<bool>(false, $"duplicated friendly url '{friendlyUrl}'");
    }

    ImportJob job = new ImportJob()
    {
        JobType = JobType.Walters,
        ResourceNumber = resourceNumber,
        FriendlyUrl = friendlyUrl,
        SrcUrl = url,
        QueueTime = DateTime.Now,
        ProgressPercent = 0,
        Status = ImportJobStatus.NotStarted
    };

    await _context.ImportJobs.AddAsync(job);
    await _context.SaveChangesAsync();

    try
    {
        _backgroundTaskQueue.QueueBackgroundWorkItem
            (
            async token =>
            {
                // Records the job as failed using a short-lived context of its own.
                Func<string, Task> failJobAsync = async message =>
                {
                    using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                    {
                        job.EndTime = DateTime.Now;
                        job.Status = ImportJobStatus.Failed;
                        job.Exception = message;
                        importJobUpdaterDb.Update(job);
                        await importJobUpdaterDb.SaveChangesAsync();
                    }
                };

                // 503 responses are retried with a linearly growing delay
                const int importRetryCount = 5;
                const int importRetryInitialSleep = 500;

                try
                {
                    using (var client = new HttpClient())
                    using (var result = await client.GetAsync(url))
                    {
                        if (!result.IsSuccessStatusCode)
                        {
                            await failJobAsync($"Http result is not ok ({result.StatusCode}) for {url}");
                            return;
                        }

                        using (RMuseumDbContext context = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                        {
                            RArtifactMasterRecord book = new RArtifactMasterRecord($"extracted from {url}", $"extracted from {url}")
                            {
                                Status = PublishStatus.Draft,
                                DateTime = DateTime.Now,
                                LastModified = DateTime.Now,
                                CoverItemIndex = 0,
                                FriendlyUrl = friendlyUrl
                            };

                            List<RTagValue> meta = new List<RTagValue>();
                            RTagValue tag;

                            string xml = await result.Content.ReadAsStringAsync();

                            using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                            {
                                job.StartTime = DateTime.Now;
                                job.Status = ImportJobStatus.Running;
                                job.SrcContent = xml;
                                importJobUpdaterDb.Update(job);
                                await importJobUpdaterDb.SaveChangesAsync();
                            }

                            XNamespace tei = "http://www.tei-c.org/ns/1.0";
                            XElement elObject = XDocument.Parse(xml).Root;

                            tag = await TagHandler.PrepareAttribute(context, "Type", "Book", 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1);
                            meta.Add(tag);

                            // Optional header elements are looked up without throwing: Take(1) keeps the
                            // "first element only" semantics and yields an empty sequence when one is missing.
                            // Errors raised while preparing a tag are no longer swallowed; they fail the job.
                            List<XElement> fileDesc = elObject
                                .Elements(tei + "teiHeader").Take(1)
                                .Elements(tei + "fileDesc").Take(1)
                                .ToList();
                            List<XElement> titleStmt = fileDesc.Elements(tei + "titleStmt").Take(1).ToList();

                            XElement titleElement = titleStmt.Elements(tei + "title").FirstOrDefault();
                            if (titleElement != null)
                            {
                                string label = titleElement.Value;
                                book.Name = book.NameInEnglish = book.Description = book.DescriptionInEnglish = label;
                                tag = await TagHandler.PrepareAttribute(context, "Title", label, 1);
                                meta.Add(tag);
                            }

                            XElement authorElement = titleStmt.Elements(tei + "author").FirstOrDefault();
                            if (authorElement != null)
                            {
                                tag = await TagHandler.PrepareAttribute(context, "Contributor Names", authorElement.Value, 1);
                                meta.Add(tag);
                            }

                            // only the first respStmt is considered; it is skipped when it has no name
                            XElement respStmtElement = titleStmt.Elements(tei + "respStmt").FirstOrDefault();
                            XElement respNameElement = respStmtElement == null
                                ? null
                                : respStmtElement.Elements(tei + "name").FirstOrDefault();
                            if (respNameElement != null)
                            {
                                tag = await TagHandler.PrepareAttribute(context, "Contributor Names", respNameElement.Value, 1);
                                meta.Add(tag);
                            }

                            foreach (XElement note in fileDesc.Elements(tei + "notesStmt").Take(1).Elements(tei + "note"))
                            {
                                tag = await TagHandler.PrepareAttribute(context, "Notes", note.Value, 1);
                                meta.Add(tag);
                            }

                            tag = await TagHandler.PrepareAttribute(context, "Source", "Digitized Walters Manuscripts", 1);
                            tag.ValueSupplement = $"http://www.thedigitalwalters.org/Data/WaltersManuscripts/html/{job.ResourceNumber}/";
                            meta.Add(tag);

                            book.Tags = meta.ToArray();

                            List<RArtifactItemRecord> pages = new List<RArtifactItemRecord>();
                            string artifactFolder = Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl);
                            int order = 0;
                            foreach (var surface in elObject.Elements(tei + "facsimile").First().Elements(tei + "surface"))
                            {
                                foreach (var graphic in surface.Elements(tei + "graphic"))
                                {
                                    // a graphic without a url attribute is skipped instead of causing a NullReferenceException
                                    string graphicUrl = (string)graphic.Attribute("url");
                                    if (graphicUrl == null || !graphicUrl.Contains("sap.jpg"))
                                    {
                                        continue;
                                    }

                                    order++;

                                    // NOTE(review): this stores the page ordinal, not a real percentage - confirm intent
                                    using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                                    {
                                        job.ProgressPercent = order;
                                        importJobUpdaterDb.Update(job);
                                        await importJobUpdaterDb.SaveChangesAsync();
                                    }

                                    string paddedOrder = $"{order}".PadLeft(4, '0');
                                    string imageFileName = paddedOrder + ".jpg";

                                    RArtifactItemRecord page = new RArtifactItemRecord()
                                    {
                                        Name = $"تصویر {order}",
                                        NameInEnglish = $"Image {order}",
                                        Description = "",
                                        DescriptionInEnglish = "",
                                        Order = order,
                                        FriendlyUrl = $"p{paddedOrder}",
                                        LastModified = DateTime.Now
                                    };

                                    string imageUrl = $"http://www.thedigitalwalters.org/Data/WaltersManuscripts/{resourceNumber}/data/W.{resourceNumber.Substring(1)}/{graphicUrl}";

                                    tag = await TagHandler.PrepareAttribute(context, "Source", "Digitized Walters Manuscripts", 1);
                                    tag.ValueSupplement = imageUrl;
                                    page.Tags = new RTagValue[] { tag };

                                    if (string.IsNullOrEmpty(imageUrl))
                                    {
                                        continue;
                                    }

                                    string origPath = Path.Combine(artifactFolder, "orig", imageFileName);
                                    string normPath = Path.Combine(artifactFolder, "norm", imageFileName);
                                    string thumbPath = Path.Combine(artifactFolder, "thumb", imageFileName);

                                    // reuse the files of a previous (interrupted) import when all three variants exist
                                    if (File.Exists(origPath) && File.Exists(normPath) && File.Exists(thumbPath))
                                    {
                                        RServiceResult<RPictureFile> recoveredPicture =
                                            await _pictureFileService.RecoverFromeFiles(page.Name, page.Description, 1,
                                                imageUrl, origPath, normPath, thumbPath, imageFileName, friendlyUrl);
                                        if (recoveredPicture.Result != null)
                                        {
                                            page.Images = new RPictureFile[] { recoveredPicture.Result };
                                            page.CoverImageIndex = 0;

                                            if (book.CoverItemIndex == (order - 1))
                                            {
                                                book.CoverImage = RPictureFile.Duplicate(recoveredPicture.Result);
                                            }

                                            pages.Add(page);
                                            continue;
                                        }
                                    }

                                    // remove partial leftovers before downloading the image again
                                    foreach (string stalePath in new[] { origPath, normPath, thumbPath })
                                    {
                                        if (File.Exists(stalePath))
                                        {
                                            File.Delete(stalePath);
                                        }
                                    }

                                    var imageResult = await client.GetAsync(imageUrl);
                                    try
                                    {
                                        int retryCount = 0;
                                        while (retryCount < importRetryCount && !imageResult.IsSuccessStatusCode && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable)
                                        {
                                            imageResult.Dispose();
                                            // Task.Delay keeps the thread free while waiting (Thread.Sleep would block it)
                                            await Task.Delay(importRetryInitialSleep * (retryCount + 1));
                                            imageResult = await client.GetAsync(imageUrl);
                                            retryCount++;
                                        }

                                        if (!imageResult.IsSuccessStatusCode)
                                        {
                                            await failJobAsync($"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}");
                                            return;
                                        }

                                        using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync())
                                        {
                                            RServiceResult<RPictureFile> picture =
                                                await _pictureFileService.Add(page.Name, page.Description, 1, null, imageUrl, imageStream, imageFileName, friendlyUrl);
                                            if (picture.Result == null)
                                            {
                                                throw new Exception($"_pictureFileService.Add : {picture.ExceptionString}");
                                            }

                                            page.Images = new RPictureFile[] { picture.Result };
                                            page.CoverImageIndex = 0;

                                            if (book.CoverItemIndex == (order - 1))
                                            {
                                                book.CoverImage = RPictureFile.Duplicate(picture.Result);
                                            }

                                            pages.Add(page);
                                        }
                                    }
                                    finally
                                    {
                                        // the response is released on every exit path (return, exception)
                                        imageResult.Dispose();
                                    }

                                    // kept from the original implementation, which forces a collection after each downloaded image
                                    GC.Collect();
                                }
                            }

                            book.Items = pages.ToArray();
                            book.ItemCount = pages.Count;

                            if (pages.Count == 0)
                            {
                                await failJobAsync("Pages.Count == 0");
                                return;
                            }

                            await context.Artifacts.AddAsync(book);
                            await context.SaveChangesAsync();

                            job.ProgressPercent = 100;
                            job.Status = ImportJobStatus.Succeeded;
                            job.ArtifactId = book.Id;
                            job.EndTime = DateTime.Now;
                            context.Update(job);
                            await context.SaveChangesAsync();
                        }
                    }
                }
                catch (Exception exp)
                {
                    await failJobAsync(exp.ToString());
                }
            }
            );

        return new RServiceResult<bool>(true);
    }
    catch (Exception exp)
    {
        return new RServiceResult<bool>(false, exp.ToString());
    }
}
/// <summary>
/// Downloads the Harvard IIIF manifest of a resource, fills the title fields and tags of
/// <paramref name="book"/> and builds one page per canvas, recovering page images from files
/// already on disk or downloading them.
/// </summary>
/// <param name="hardvardResourceNumber">identifier appended to https://iiif.lib.harvard.edu/manifests/drs:</param>
/// <param name="job">import job whose progress (and failure state, if any) is updated while importing</param>
/// <param name="friendlyUrl">friendly url of the artifact; also the name of its image folder</param>
/// <param name="context">database context passed to the tag handler</param>
/// <param name="book">artifact record that receives names, descriptions and the cover image</param>
/// <param name="meta">tag list of the artifact; title and contributor tags are appended to it</param>
/// <returns>
/// the imported pages, or a result with a null list and the message "failed" when the manifest or
/// one of the images could not be downloaded (the job is marked as failed in that case)
/// </returns>
private async Task<RServiceResult<List<RArtifactItemRecord>>> _InternalHarvardJsonImport(string hardvardResourceNumber, ImportJob job, string friendlyUrl, RMuseumDbContext context, RArtifactMasterRecord book, List<RTagValue> meta)
{
    List<RArtifactItemRecord> pages = new List<RArtifactItemRecord>();
    string manifestUrl = $"https://iiif.lib.harvard.edu/manifests/drs:{hardvardResourceNumber}";

    // Records the job as failed using a short-lived context of its own.
    Func<string, Task> failJobAsync = async message =>
    {
        using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
        {
            job.EndTime = DateTime.Now;
            job.Status = ImportJobStatus.Failed;
            job.Exception = message;
            importJobUpdaterDb.Update(job);
            await importJobUpdaterDb.SaveChangesAsync();
        }
    };

    // 503 responses are retried with a linearly growing delay
    const int importRetryCount = 5;
    const int importRetryInitialSleep = 500;

    using (var client = new HttpClient())
    using (var jsonResult = await client.GetAsync(manifestUrl))
    {
        if (!jsonResult.IsSuccessStatusCode)
        {
            await failJobAsync($"Http result is not ok ({jsonResult.StatusCode}) for {manifestUrl}");
            return new RServiceResult<List<RArtifactItemRecord>>(null, "failed");
        }

        string json = await jsonResult.Content.ReadAsStringAsync();
        var parsed = JObject.Parse(json);
        book.Name = book.NameInEnglish = book.Description = book.DescriptionInEnglish = parsed.SelectToken("label").Value<string>();

        RTagValue tag;

        tag = await TagHandler.PrepareAttribute(context, "Title", book.Name, 1);
        meta.Add(tag);

        tag = await TagHandler.PrepareAttribute(context, "Contributor Names", "تعیین نشده", 1);
        meta.Add(tag);

        // structure labels are used below as the more descriptive page names
        List<string> labels = new List<string>();
        foreach (JToken structure in parsed.SelectTokens("$.structures[*].label"))
        {
            labels.Add(structure.Value<string>());
        }

        string artifactFolder = Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl);
        var canvases = parsed.SelectToken("sequences").First().SelectToken("canvases").ToArray();
        int pageCount = canvases.Length;
        int order = 0;
        foreach (JToken canvas in canvases)
        {
            order++;

            using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
            {
                job.ProgressPercent = order * 100 / (decimal)pageCount;
                importJobUpdaterDb.Update(job);
                await importJobUpdaterDb.SaveChangesAsync();
            }

            string label = canvas.SelectToken("label").Value<string>();

            // Prefer the structure label containing the canvas label. FirstOrDefault (not SingleOrDefault):
            // a short canvas label can be a substring of several structure labels, which used to throw
            // and abort the whole import.
            string structureLabel = labels.FirstOrDefault(l => l != null && l.IndexOf(label, StringComparison.Ordinal) != -1);
            if (structureLabel != null)
            {
                label = structureLabel;
            }

            string imageUrl = canvas.SelectTokens("images[*]").First().SelectToken("resource").SelectToken("@id").Value<string>();
            string paddedOrder = $"{order}".PadLeft(4, '0');
            string imageFileName = paddedOrder + ".jpg";

            RArtifactItemRecord page = new RArtifactItemRecord()
            {
                Name = $"تصویر {order}",
                NameInEnglish = label,
                Description = "",
                DescriptionInEnglish = "",
                Order = order,
                FriendlyUrl = $"p{paddedOrder}",
                LastModified = DateTime.Now
            };

            tag = await TagHandler.PrepareAttribute(context, "Source", "Harvard University Islamic Heritage Project", 1);
            tag.ValueSupplement = imageUrl;
            page.Tags = new RTagValue[] { tag };

            if (string.IsNullOrEmpty(imageUrl))
            {
                continue;
            }

            string origPath = Path.Combine(artifactFolder, "orig", imageFileName);
            string normPath = Path.Combine(artifactFolder, "norm", imageFileName);
            string thumbPath = Path.Combine(artifactFolder, "thumb", imageFileName);

            // reuse the files of a previous (interrupted) import when all three variants exist
            if (File.Exists(origPath) && File.Exists(normPath) && File.Exists(thumbPath))
            {
                RServiceResult<RPictureFile> recoveredPicture =
                    await _pictureFileService.RecoverFromeFiles(page.Name, page.Description, 1,
                        imageUrl, origPath, normPath, thumbPath, imageFileName, friendlyUrl);
                if (recoveredPicture.Result != null)
                {
                    page.Images = new RPictureFile[] { recoveredPicture.Result };
                    page.CoverImageIndex = 0;

                    if (book.CoverItemIndex == (order - 1))
                    {
                        book.CoverImage = RPictureFile.Duplicate(recoveredPicture.Result);
                    }

                    pages.Add(page);
                    continue;
                }
            }

            // remove partial leftovers before downloading the image again
            foreach (string stalePath in new[] { origPath, normPath, thumbPath })
            {
                if (File.Exists(stalePath))
                {
                    File.Delete(stalePath);
                }
            }

            var imageResult = await client.GetAsync(imageUrl);
            try
            {
                int retryCount = 0;
                while (retryCount < importRetryCount && !imageResult.IsSuccessStatusCode && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable)
                {
                    imageResult.Dispose();
                    // Task.Delay keeps the thread free while waiting (Thread.Sleep would block it)
                    await Task.Delay(importRetryInitialSleep * (retryCount + 1));
                    imageResult = await client.GetAsync(imageUrl);
                    retryCount++;
                }

                if (!imageResult.IsSuccessStatusCode)
                {
                    await failJobAsync($"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}");
                    return new RServiceResult<List<RArtifactItemRecord>>(null, "failed");
                }

                using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync())
                {
                    RServiceResult<RPictureFile> picture =
                        await _pictureFileService.Add(page.Name, page.Description, 1, null, imageUrl, imageStream, imageFileName, friendlyUrl);
                    if (picture.Result == null)
                    {
                        throw new Exception($"_pictureFileService.Add : {picture.ExceptionString}");
                    }

                    page.Images = new RPictureFile[] { picture.Result };
                    page.CoverImageIndex = 0;

                    if (book.CoverItemIndex == (order - 1))
                    {
                        book.CoverImage = RPictureFile.Duplicate(picture.Result);
                    }

                    pages.Add(page);
                }
            }
            finally
            {
                // the response is released on every exit path (return, exception)
                imageResult.Dispose();
            }

            // kept from the original implementation, which forces a collection after each downloaded image
            GC.Collect();
        }
    }

    return new RServiceResult<List<RArtifactItemRecord>>(pages);
}
/// <summary>
/// Schedules a background import of a manuscript from https://curiosity.lib.harvard.edu
/// </summary>
/// <remarks>
/// The method itself only validates the request and stores an <c>ImportJob</c> record; the actual
/// download runs in a queued background work item which scrapes the catalog page for metadata
/// (<c>&lt;dt&gt;</c>/<c>&lt;dd&gt;</c> pairs), locates the pds.lib.harvard.edu viewer link and hands the
/// resource number found there to <c>_InternalHarvardJsonImport</c> to fetch the pages.
/// </remarks>
/// <param name="url">example: https://curiosity.lib.harvard.edu/islamic-heritage-project/catalog/40-990114893240203941</param>
/// <param name="friendlyUrl">friendly url of the artifact to be created; must be non-empty and not used by another artifact</param>
/// <returns>true if the job was queued; otherwise false together with an explanatory message</returns>
private async Task<RServiceResult<bool>> StartImportingFromHarvard(string url, string friendlyUrl)
{
    try
    {
        // refuse to queue the same source twice unless the previous attempt failed or was aborted
        if (
            (
            await _context.ImportJobs
                .Where(j => j.JobType == JobType.Harvard && j.ResourceNumber == url && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted))
                .SingleOrDefaultAsync()
            )
            != null
            )
        {
            return new RServiceResult<bool>(false, $"Job is already scheduled or running for importing {url}");
        }

        if (string.IsNullOrEmpty(friendlyUrl))
        {
            return new RServiceResult<bool>(false, $"Friendly url is empty, url = {url}");
        }

        if ((await _context.Artifacts.Where(a => a.FriendlyUrl == friendlyUrl).SingleOrDefaultAsync()) != null)
        {
            return new RServiceResult<bool>(false, $"duplicated friendly url '{friendlyUrl}'");
        }

        ImportJob job = new ImportJob()
        {
            JobType = JobType.Harvard,
            ResourceNumber = url,
            FriendlyUrl = friendlyUrl,
            SrcUrl = url,
            QueueTime = DateTime.Now,
            ProgressPercent = 0,
            Status = ImportJobStatus.NotStarted
        };

        await _context.ImportJobs.AddAsync(job);
        await _context.SaveChangesAsync();

        _backgroundTaskQueue.QueueBackgroundWorkItem
        (
            async token =>
            {
                // Records the job as failed through its own short-lived context, so the failure is
                // stored independently of whatever state the main import context is in.
                async Task FailJobAsync(string message)
                {
                    using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                    {
                        job.EndTime = DateTime.Now;
                        job.Status = ImportJobStatus.Failed;
                        job.Exception = message;
                        importJobUpdaterDb.Update(job);
                        await importJobUpdaterDb.SaveChangesAsync();
                    }
                }

                try
                {
                    using (var client = new HttpClient())
                    using (var result = await client.GetAsync(url))
                    {
                        if (!result.IsSuccessStatusCode)
                        {
                            await FailJobAsync($"Http result is not ok ({result.StatusCode}) for {url}");
                            return;
                        }

                        using (RMuseumDbContext context = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                        {
                            RArtifactMasterRecord book = new RArtifactMasterRecord($"extracted from {url}", $"extracted from {url}")
                            {
                                Status = PublishStatus.Draft,
                                DateTime = DateTime.Now,
                                LastModified = DateTime.Now,
                                CoverItemIndex = 0,
                                FriendlyUrl = friendlyUrl
                            };

                            List<RTagValue> meta = new List<RTagValue>();
                            RTagValue tag;

                            string html = await result.Content.ReadAsStringAsync();

                            using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                            {
                                job.StartTime = DateTime.Now;
                                job.Status = ImportJobStatus.Running;
                                job.SrcContent = html;
                                importJobUpdaterDb.Update(job);
                                await importJobUpdaterDb.SaveChangesAsync();
                            }

                            // Metadata: every <dt>name:</dt><dd>v1<br/>v2</dd> pair becomes one tag per value.
                            // Scanning stops at the first malformed pair (any delimiter missing) and keeps
                            // what has been collected so far.
                            int nStartIndex = html.IndexOf("<dt");
                            while (nStartIndex != -1)
                            {
                                nStartIndex = html.IndexOf(">", nStartIndex);
                                if (nStartIndex == -1)
                                {
                                    break;
                                }
                                nStartIndex++;

                                int nNameEnd = html.IndexOf(":", nStartIndex);
                                if (nNameEnd == -1)
                                {
                                    // no label terminator: Substring would be called with a negative length
                                    break;
                                }
                                string tagName = html.Substring(nStartIndex, nNameEnd - nStartIndex);

                                nStartIndex = html.IndexOf("<dd", nStartIndex);
                                if (nStartIndex == -1)
                                {
                                    break;
                                }
                                nStartIndex = html.IndexOf(">", nStartIndex);
                                if (nStartIndex == -1)
                                {
                                    break;
                                }
                                nStartIndex++;

                                int nValuesEnd = html.IndexOf("</dd>", nStartIndex);
                                if (nValuesEnd == -1)
                                {
                                    // unterminated <dd>: same negative-length hazard as above
                                    break;
                                }
                                string tagValues = html.Substring(nStartIndex, nValuesEnd - nStartIndex);

                                foreach (string tagValuePart in tagValues.Split("<br/>", StringSplitOptions.RemoveEmptyEntries))
                                {
                                    string tagValue = tagValuePart;
                                    bool href = false;
                                    if (tagValue.IndexOf("<a href=") != -1)
                                    {
                                        // keep only the text between <a ...> and the next tag
                                        href = true;
                                        tagValue = tagValue.Substring(tagValue.IndexOf('>') + 1);
                                        int nLinkTextEnd = tagValue.IndexOf('<');
                                        if (nLinkTextEnd != -1)
                                        {
                                            tagValue = tagValue.Substring(0, nLinkTextEnd);
                                        }
                                    }
                                    tag = await TagHandler.PrepareAttribute(context, tagName, tagValue, 1);
                                    if (href)
                                    {
                                        // NOTE(review): the supplement receives the link text, not the href target - confirm this is intended
                                        tag.ValueSupplement = tagValue;
                                    }
                                    meta.Add(tag);
                                }

                                nStartIndex = html.IndexOf("<dt", nStartIndex + 1);
                            }

                            // fixed tags attached to every artifact imported from this source
                            tag = await TagHandler.PrepareAttribute(context, "Type", "Book", 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Source", "Harvard University Islamic Heritage Project", 1);
                            tag.ValueSupplement = $"{job.SrcUrl}";
                            meta.Add(tag);

                            // the viewer link carries the resource number needed for the page import
                            const string viewerLinkPrefix = "https://pds.lib.harvard.edu/pds/view/";
                            nStartIndex = html.IndexOf(viewerLinkPrefix);
                            if (nStartIndex == -1)
                            {
                                await FailJobAsync("Not found https://pds.lib.harvard.edu/pds/view/");
                                return;
                            }

                            nStartIndex += viewerLinkPrefix.Length;
                            int nResourceNumberEnd = html.IndexOf('\"', nStartIndex);
                            if (nResourceNumberEnd == -1)
                            {
                                await FailJobAsync("Not found closing quote after https://pds.lib.harvard.edu/pds/view/");
                                return;
                            }

                            string hardvardResourceNumber = html.Substring(nStartIndex, nResourceNumberEnd - nStartIndex);

                            List<RArtifactItemRecord> pages = (await _InternalHarvardJsonImport(hardvardResourceNumber, job, friendlyUrl, context, book, meta)).Result;
                            if (pages == null)
                            {
                                // the page import gave up; nothing is recorded here
                                // (presumably the helper has already stored the failure on the job - verify against it)
                                return;
                            }

                            book.Tags = meta.ToArray();
                            book.Items = pages.ToArray();
                            book.ItemCount = pages.Count;

                            if (pages.Count == 0)
                            {
                                await FailJobAsync("Pages.Count == 0");
                                return;
                            }

                            await context.Artifacts.AddAsync(book);
                            await context.SaveChangesAsync();

                            job.ProgressPercent = 100;
                            job.Status = ImportJobStatus.Succeeded;
                            job.ArtifactId = book.Id;
                            job.EndTime = DateTime.Now;
                            context.Update(job);
                            await context.SaveChangesAsync();
                        }
                    }
                }
                catch (Exception exp)
                {
                    await FailJobAsync(exp.ToString());
                }
            }
        );

        return new RServiceResult<bool>(true);
    }
    catch (Exception exp)
    {
        return new RServiceResult<bool>(false, exp.ToString());
    }
}
/// <summary>
/// Schedules a background import of a book from https://catalog.hathitrust.org
/// </summary>
/// <remarks>
/// The catalog record is downloaded as XML; title, author and notes are taken from its
/// <c>datafield</c> elements and the id of the scanned volume from the <c>HOL</c> field (subfield <c>p</c>).
/// Page images are then requested one by one with an increasing sequence number. The import stops when
/// the thumbnail of a freshly downloaded page has the same MD5 hash as that of the previously downloaded
/// page; that repeated image is deleted and not added to the book.
/// </remarks>
/// <param name="resourceNumber">006814127</param>
/// <param name="friendlyUrl">friendly url of the artifact to be created; <paramref name="resourceNumber"/> is used when empty</param>
/// <returns>true if the job was queued; otherwise false together with an explanatory message</returns>
private async Task<RServiceResult<bool>> StartImportingFromHathiTrust(string resourceNumber, string friendlyUrl)
{
    string url = $"https://catalog.hathitrust.org/Record/{resourceNumber}.xml";

    // refuse to queue the same resource twice unless the previous attempt failed or was aborted
    if (
        (
        await _context.ImportJobs
            .Where(j => j.JobType == JobType.HathiTrust && j.ResourceNumber == resourceNumber && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted))
            .SingleOrDefaultAsync()
        )
        != null
        )
    {
        return new RServiceResult<bool>(false, $"Job is already scheduled or running for importing {url}");
    }

    if (string.IsNullOrEmpty(friendlyUrl))
    {
        friendlyUrl = resourceNumber;
    }

    if ((await _context.Artifacts.Where(a => a.FriendlyUrl == friendlyUrl).SingleOrDefaultAsync()) != null)
    {
        return new RServiceResult<bool>(false, $"duplicated artifact friendly url '{friendlyUrl}'");
    }

    ImportJob job = new ImportJob()
    {
        JobType = JobType.HathiTrust,
        ResourceNumber = resourceNumber,
        FriendlyUrl = friendlyUrl,
        SrcUrl = url,
        QueueTime = DateTime.Now,
        ProgressPercent = 0,
        Status = ImportJobStatus.NotStarted
    };

    await _context.ImportJobs.AddAsync(job);
    await _context.SaveChangesAsync();

    try
    {
        _backgroundTaskQueue.QueueBackgroundWorkItem
        (
            async token =>
            {
                // Records the job as failed through its own short-lived context.
                async Task FailJobAsync(string message)
                {
                    using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                    {
                        job.EndTime = DateTime.Now;
                        job.Status = ImportJobStatus.Failed;
                        job.Exception = message;
                        importJobUpdaterDb.Update(job);
                        await importJobUpdaterDb.SaveChangesAsync();
                    }
                }

                try
                {
                    using (var client = new HttpClient())
                    using (var result = await client.GetAsync(url))
                    {
                        if (!result.IsSuccessStatusCode)
                        {
                            await FailJobAsync($"Http result is not ok ({result.StatusCode}) for {url}");
                            return;
                        }

                        using (RMuseumDbContext context = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                        {
                            // the friendly url may have been taken while the job was waiting in the queue
                            if ((await context.Artifacts.Where(a => a.FriendlyUrl == job.FriendlyUrl).SingleOrDefaultAsync()) != null)
                            {
                                await FailJobAsync("aborted because of duplicated friendly url");
                                return;
                            }

                            RArtifactMasterRecord book = new RArtifactMasterRecord($"extracted from {url}", $"extracted from {url}")
                            {
                                Status = PublishStatus.Draft,
                                DateTime = DateTime.Now,
                                LastModified = DateTime.Now,
                                CoverItemIndex = 0,
                                FriendlyUrl = friendlyUrl
                            };

                            List<RTagValue> meta = new List<RTagValue>();
                            RTagValue tag;

                            string xml = await result.Content.ReadAsStringAsync();

                            using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                            {
                                job.StartTime = DateTime.Now;
                                job.Status = ImportJobStatus.Running;
                                job.SrcContent = xml;
                                importJobUpdaterDb.Update(job);
                                await importJobUpdaterDb.SaveChangesAsync();
                            }

                            string title = "";
                            string author = "";
                            string pdfResourceNumber = "";
                            int tagOrder = 1;

                            XElement elObject = XDocument.Parse(xml).Root;
                            foreach (var datafield in elObject.Element("record").Elements("datafield"))
                            {
                                // the order advances for every datafield, including the ones that produce no tag
                                tagOrder++;
                                if (datafield.Attribute("tag") == null)
                                {
                                    continue;
                                }

                                string hathiTrustTag = datafield.Attribute("tag").Value;
                                switch (hathiTrustTag)
                                {
                                    case "245":
                                    case "246":
                                        // title: subfields a and f
                                        foreach (var subfield in datafield.Elements("subfield"))
                                        {
                                            string code = subfield.Attribute("code")?.Value;
                                            if (code == "a" || code == "f")
                                            {
                                                title = (title + " " + subfield.Value).Trim();
                                            }
                                        }
                                        break;
                                    case "100":
                                        // author: subfields a and d
                                        foreach (var subfield in datafield.Elements("subfield"))
                                        {
                                            string code = subfield.Attribute("code")?.Value;
                                            if (code == "a" || code == "d")
                                            {
                                                author = (author + " " + subfield.Value).Trim();
                                            }
                                        }
                                        break;
                                    case "HOL":
                                        // id of the scanned volume: subfield p (the last one found wins)
                                        foreach (var subfield in datafield.Elements("subfield"))
                                        {
                                            if (subfield.Attribute("code")?.Value == "p")
                                            {
                                                pdfResourceNumber = subfield.Value;
                                            }
                                        }
                                        break;
                                    default:
                                        // every other numeric field in 100..900 becomes a "Notes" tag
                                        if (int.TryParse(hathiTrustTag, out int tmp) && tmp >= 100 && tmp <= 900)
                                        {
                                            string note = "";
                                            foreach (var subfield in datafield.Elements("subfield"))
                                            {
                                                if (subfield.Attribute("code") != null)
                                                {
                                                    note = (note + " " + subfield.Value).Trim();
                                                }
                                            }
                                            tag = await TagHandler.PrepareAttribute(context, "Notes", note, tagOrder);
                                            meta.Add(tag);
                                        }
                                        break;
                                }
                            }

                            if (string.IsNullOrEmpty(pdfResourceNumber))
                            {
                                await FailJobAsync("pdfResourceNumber not found");
                                return;
                            }

                            tag = await TagHandler.PrepareAttribute(context, "Title", title, 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Contributor Names", author, 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Type", "Book", 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Source", "HathiTrust Digital Library", 1);
                            string viewerUrl = $"https://babel.hathitrust.org/cgi/pt?id={pdfResourceNumber}";
                            tag.ValueSupplement = viewerUrl;
                            meta.Add(tag);

                            book.Name = book.NameInEnglish = book.Description = book.DescriptionInEnglish = title;
                            book.Tags = meta.ToArray();

                            List<RArtifactItemRecord> pages = new List<RArtifactItemRecord>();

                            const int importRetryCount = 200;
                            const int importRetryInitialSleep = 500; // milliseconds; multiplied by the attempt number

                            string lastMD5hash = "";
                            int order = 0;
                            while (true)
                            {
                                order++;

                                // the page count is unknown in advance, so the page number is stored as progress
                                using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                                {
                                    job.ProgressPercent = order;
                                    importJobUpdaterDb.Update(job);
                                    await importJobUpdaterDb.SaveChangesAsync();
                                }

                                string imageUrl = $"https://babel.hathitrust.org/cgi/imgsrv/image?id={pdfResourceNumber};seq={order};size=1000;rotation=0";

                                RArtifactItemRecord page = new RArtifactItemRecord()
                                {
                                    Name = $"تصویر {order}",
                                    NameInEnglish = $"Image {order}",
                                    Description = "",
                                    DescriptionInEnglish = "",
                                    Order = order,
                                    FriendlyUrl = $"p{$"{order}".PadLeft(4, '0')}",
                                    LastModified = DateTime.Now
                                };

                                tag = await TagHandler.PrepareAttribute(context, "Source", "HathiTrust Digital Library", 1);
                                tag.ValueSupplement = viewerUrl;
                                page.Tags = new RTagValue[] { tag };

                                // locations of the three renditions of this page on disk
                                string fileName = $"{order}".PadLeft(4, '0') + ".jpg";
                                string artifactFolder = Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl);
                                string origPath = Path.Combine(artifactFolder, "orig", fileName);
                                string normPath = Path.Combine(artifactFolder, "norm", fileName);
                                string thumbPath = Path.Combine(artifactFolder, "thumb", fileName);

                                // reuse files left behind by a previous attempt when all three renditions exist
                                bool recovered = false;
                                if (File.Exists(origPath) && File.Exists(normPath) && File.Exists(thumbPath))
                                {
                                    RServiceResult<RPictureFile> picture = await _pictureFileService.RecoverFromeFiles(page.Name, page.Description, 1,
                                        imageUrl, origPath, normPath, thumbPath, fileName, friendlyUrl);
                                    if (picture.Result != null)
                                    {
                                        recovered = true;
                                        page.Images = new RPictureFile[] { picture.Result };
                                        page.CoverImageIndex = 0;

                                        if (book.CoverItemIndex == (order - 1))
                                        {
                                            book.CoverImage = RPictureFile.Duplicate(picture.Result);
                                        }

                                        pages.Add(page);
                                    }
                                }

                                if (recovered)
                                {
                                    continue;
                                }

                                // remove partial leftovers before downloading again
                                foreach (string stalePath in new string[] { origPath, normPath, thumbPath })
                                {
                                    if (File.Exists(stalePath))
                                    {
                                        File.Delete(stalePath);
                                    }
                                }

                                var imageResult = await client.GetAsync(imageUrl);
                                try
                                {
                                    // retry only on 503, waiting a little longer after every attempt;
                                    // Task.Delay keeps the worker thread free while waiting
                                    int retryCount = 0;
                                    while (retryCount < importRetryCount && !imageResult.IsSuccessStatusCode && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable)
                                    {
                                        imageResult.Dispose();
                                        await Task.Delay(importRetryInitialSleep * (retryCount + 1));
                                        imageResult = await client.GetAsync(imageUrl);
                                        retryCount++;
                                    }

                                    if (!imageResult.IsSuccessStatusCode)
                                    {
                                        await FailJobAsync($"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}");
                                        return;
                                    }

                                    bool lastPage = false;
                                    using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync())
                                    {
                                        RServiceResult<RPictureFile> picture = await _pictureFileService.Add(page.Name, page.Description, 1, null, imageUrl, imageStream, fileName, friendlyUrl);
                                        if (picture.Result == null)
                                        {
                                            throw new Exception($"_pictureFileService.Add : {picture.ExceptionString}");
                                        }

                                        // a thumbnail identical to the previous download marks the end of the book
                                        using (var md5 = MD5.Create())
                                        {
                                            string md5hash = string.Join("", md5.ComputeHash(File.ReadAllBytes(thumbPath)).Select(x => x.ToString("X2")));
                                            if (md5hash == lastMD5hash)
                                            {
                                                File.Delete(origPath);
                                                File.Delete(normPath);
                                                File.Delete(thumbPath);
                                                lastPage = true;
                                            }
                                            lastMD5hash = md5hash;
                                        }

                                        if (!lastPage)
                                        {
                                            page.Images = new RPictureFile[] { picture.Result };
                                            page.CoverImageIndex = 0;

                                            if (book.CoverItemIndex == (order - 1))
                                            {
                                                book.CoverImage = RPictureFile.Duplicate(picture.Result);
                                            }

                                            pages.Add(page);
                                        }
                                    }

                                    if (lastPage)
                                    {
                                        break;
                                    }
                                }
                                finally
                                {
                                    // runs on every exit path (success, failure, end of book, exception)
                                    imageResult.Dispose();
                                }

                                // kept from the original; presumably limits memory growth between image downloads - TODO confirm it is still needed
                                GC.Collect();
                            }

                            book.Items = pages.ToArray();
                            book.ItemCount = pages.Count;

                            if (pages.Count == 0)
                            {
                                await FailJobAsync("Pages.Count == 0");
                                return;
                            }

                            await context.Artifacts.AddAsync(book);
                            await context.SaveChangesAsync();

                            job.ProgressPercent = 100;
                            job.Status = ImportJobStatus.Succeeded;
                            job.ArtifactId = book.Id;
                            job.EndTime = DateTime.Now;
                            context.Update(job);
                            await context.SaveChangesAsync();
                        }
                    }
                }
                catch (Exception exp)
                {
                    await FailJobAsync(exp.ToString());
                }
            }
        );

        return new RServiceResult<bool>(true);
    }
    catch (Exception exp)
    {
        return new RServiceResult<bool>(false, exp.ToString());
    }
}
/// <summary>
/// Schedules a background import of a manuscript from https://viewer.cbl.ie
/// </summary>
/// <remarks>
/// Page images are requested one by one with an increasing page number until the server answers
/// 403 (Forbidden) or 404 (NotFound), which is treated as "past the last page". Renditions that
/// already exist on disk are reused instead of being downloaded again.
/// </remarks>
/// <param name="resourceNumber">119</param>
/// <param name="friendlyUrl">golestan-baysonghori</param>
/// <returns>true if the job was queued; otherwise false together with an explanatory message</returns>
private async Task<RServiceResult<bool>> StartImportingFromChesterBeatty(string resourceNumber, string friendlyUrl)
{
    try
    {
        string srcUrl = $"https://viewer.cbl.ie/viewer/object/Per_{resourceNumber}/1/";

        // refuse to queue the same resource twice unless the previous attempt failed or was aborted
        if (
            (
            await _context.ImportJobs
                .Where(j => j.JobType == JobType.ChesterBeatty && j.ResourceNumber == resourceNumber && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted))
                .SingleOrDefaultAsync()
            )
            != null
            )
        {
            return new RServiceResult<bool>(false, $"Job is already scheduled or running for importing {srcUrl}");
        }

        if (string.IsNullOrEmpty(friendlyUrl))
        {
            return new RServiceResult<bool>(false, $"Friendly url is empty, server folder {srcUrl}");
        }

        if ((await _context.Artifacts.Where(a => a.FriendlyUrl == friendlyUrl).SingleOrDefaultAsync()) != null)
        {
            return new RServiceResult<bool>(false, $"duplicated friendly url '{friendlyUrl}'");
        }

        ImportJob job = new ImportJob()
        {
            JobType = JobType.ChesterBeatty,
            ResourceNumber = resourceNumber,
            FriendlyUrl = friendlyUrl,
            SrcUrl = srcUrl,
            QueueTime = DateTime.Now,
            ProgressPercent = 0,
            Status = ImportJobStatus.NotStarted
        };

        await _context.ImportJobs.AddAsync(job);
        await _context.SaveChangesAsync();

        _backgroundTaskQueue.QueueBackgroundWorkItem
        (
            async token =>
            {
                // Records the job as failed through its own short-lived context.
                async Task FailJobAsync(string message)
                {
                    using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration))
                    {
                        job.EndTime = DateTime.Now;
                        job.Status = ImportJobStatus.Failed;
                        job.Exception = message;
                        importJobUpdaterDb.Update(job);
                        await importJobUpdaterDb.SaveChangesAsync();
                    }
                }

                try
                {
                    using (RMuseumDbContext context = new RMuseumDbContext(Configuration))
                    {
                        RArtifactMasterRecord book = new RArtifactMasterRecord($"extracted from url {job.ResourceNumber}", $"extracted from url {job.ResourceNumber}")
                        {
                            Status = PublishStatus.Draft,
                            DateTime = DateTime.Now,
                            LastModified = DateTime.Now,
                            CoverItemIndex = 0,
                            FriendlyUrl = friendlyUrl,
                        };

                        List<RTagValue> meta = new List<RTagValue>();
                        RTagValue tag;

                        tag = await TagHandler.PrepareAttribute(context, "Type", "Book", 1);
                        meta.Add(tag);

                        tag = await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1);
                        meta.Add(tag);

                        tag = await TagHandler.PrepareAttribute(context, "Source", "Chester Beatty Digital Collections", 1);
                        tag.ValueSupplement = srcUrl;
                        meta.Add(tag);

                        using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration))
                        {
                            job.StartTime = DateTime.Now;
                            job.Status = ImportJobStatus.Running;
                            job.SrcContent = "";
                            importJobUpdaterDb.Update(job);
                            await importJobUpdaterDb.SaveChangesAsync();
                        }

                        List<RArtifactItemRecord> pages = new List<RArtifactItemRecord>();

                        // Attaches the picture and the per-page source tag to the page and appends it to the
                        // book - exactly once per page (the original appended every page twice).
                        async Task AddPageAsync(RArtifactItemRecord page, RPictureFile pictureFile, int pageOrder)
                        {
                            page.Images = new RPictureFile[] { pictureFile };
                            page.CoverImageIndex = 0;

                            if (book.CoverItemIndex == (pageOrder - 1))
                            {
                                book.CoverImage = RPictureFile.Duplicate(pictureFile);
                            }

                            RTagValue pageSourceTag = await TagHandler.PrepareAttribute(context, "Source", "Chester Beatty Digital Collections", 1);
                            pageSourceTag.ValueSupplement = $"https://viewer.cbl.ie/viewer/object/Per_{resourceNumber}/{$"{pageOrder}".PadLeft(3, '0')}/";
                            page.Tags = new RTagValue[] { pageSourceTag };

                            pages.Add(page);
                        }

                        const int importRetryCount = 5;
                        const int importRetryInitialSleep = 500; // milliseconds; multiplied by the attempt number

                        int order = 0;
                        using (var client = new HttpClient())
                        {
                            while (true)
                            {
                                order++;

                                // the page count is unknown in advance, so the page number is stored as progress
                                using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration))
                                {
                                    job.ProgressPercent = order;
                                    importJobUpdaterDb.Update(job);
                                    await importJobUpdaterDb.SaveChangesAsync();
                                }

                                RArtifactItemRecord page = new RArtifactItemRecord()
                                {
                                    Name = $"تصویر {order}",
                                    NameInEnglish = $"Image {order} of {book.NameInEnglish}",
                                    Description = "",
                                    DescriptionInEnglish = "",
                                    Order = order,
                                    FriendlyUrl = $"p{$"{order}".PadLeft(4, '0')}",
                                    LastModified = DateTime.Now
                                };

                                string imageUrl = $"https://viewer.cbl.ie/viewer/rest/image/Per_{resourceNumber}/Per{resourceNumber}_{$"{order}".PadLeft(3, '0')}.jpg/full/!10000,10000/0/default.jpg?ignoreWatermark=true";

                                page.Tags = new RTagValue[] { };

                                // locations of the three renditions of this page on disk
                                string fileName = $"{order}".PadLeft(4, '0') + ".jpg";
                                string artifactFolder = Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl);
                                string origPath = Path.Combine(artifactFolder, "orig", fileName);
                                string normPath = Path.Combine(artifactFolder, "norm", fileName);
                                string thumbPath = Path.Combine(artifactFolder, "thumb", fileName);

                                // reuse files left behind by a previous attempt when all three renditions exist
                                if (File.Exists(origPath) && File.Exists(normPath) && File.Exists(thumbPath))
                                {
                                    RServiceResult<RPictureFile> recoveredPicture = await _pictureFileService.RecoverFromeFiles(page.Name, page.Description, 1,
                                        imageUrl, origPath, normPath, thumbPath, fileName, friendlyUrl);
                                    if (recoveredPicture.Result != null)
                                    {
                                        await AddPageAsync(page, recoveredPicture.Result, order);
                                        continue;
                                    }
                                }

                                // remove partial leftovers before downloading again
                                foreach (string stalePath in new string[] { origPath, normPath, thumbPath })
                                {
                                    if (File.Exists(stalePath))
                                    {
                                        File.Delete(stalePath);
                                    }
                                }

                                var imageResult = await client.GetAsync(imageUrl);
                                try
                                {
                                    if (imageResult.StatusCode == HttpStatusCode.Forbidden || imageResult.StatusCode == HttpStatusCode.NotFound)
                                    {
                                        // no such page: the previous one was the last
                                        break;
                                    }

                                    // retry only on 503, waiting a little longer after every attempt;
                                    // Task.Delay keeps the worker thread free while waiting
                                    int retryCount = 0;
                                    while (retryCount < importRetryCount && !imageResult.IsSuccessStatusCode && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable)
                                    {
                                        imageResult.Dispose();
                                        await Task.Delay(importRetryInitialSleep * (retryCount + 1));
                                        imageResult = await client.GetAsync(imageUrl);
                                        retryCount++;
                                    }

                                    if (!imageResult.IsSuccessStatusCode)
                                    {
                                        await FailJobAsync($"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}");
                                        return;
                                    }

                                    using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync())
                                    {
                                        RServiceResult<RPictureFile> picture = await _pictureFileService.Add(page.Name, page.Description, 1, null, imageUrl, imageStream, fileName, friendlyUrl);
                                        if (picture.Result == null)
                                        {
                                            throw new Exception($"_pictureFileService.Add : {picture.ExceptionString}");
                                        }

                                        await AddPageAsync(page, picture.Result, order);
                                    }
                                }
                                finally
                                {
                                    // runs on every exit path (success, failure, end of book, exception)
                                    imageResult.Dispose();
                                }

                                // kept from the original; presumably limits memory growth between image downloads - TODO confirm it is still needed
                                GC.Collect();
                            }
                        }

                        book.Tags = meta.ToArray();
                        book.Items = pages.ToArray();
                        book.ItemCount = pages.Count;

                        if (pages.Count == 0)
                        {
                            await FailJobAsync("Pages.Count == 0");
                            return;
                        }

                        await context.Artifacts.AddAsync(book);
                        await context.SaveChangesAsync();

                        job.ProgressPercent = 100;
                        job.Status = ImportJobStatus.Succeeded;
                        job.ArtifactId = book.Id;
                        job.EndTime = DateTime.Now;
                        context.Update(job);
                        await context.SaveChangesAsync();
                    }
                }
                catch (Exception exp)
                {
                    await FailJobAsync(exp.ToString());
                }
            }
        );

        return new RServiceResult<bool>(true);
    }
    catch (Exception exp)
    {
        return new RServiceResult<bool>(false, exp.ToString());
    }
}
/// <summary>
/// import from http://www.qajarwomen.org
/// </summary>
/// <remarks>
/// Validates the request, stores an <c>ImportJob</c> record and queues a background work item which
/// builds the artifact from the Harvard resource number through <c>_InternalHarvardJsonImport</c>.
/// </remarks>
/// <param name="hardvardResourceNumber">43117279</param>
/// <param name="friendlyUrl">atame</param>
/// <param name="srcUrl">http://www.qajarwomen.org/fa/items/1018A10.html</param>
/// <returns>true if the job was queued; otherwise false together with an explanatory message</returns>
private async Task<RServiceResult<bool>> StartImportingFromHarvardDirectly(string hardvardResourceNumber, string friendlyUrl, string srcUrl)
{
    try
    {
        // a job for the same resource that neither failed nor was aborted blocks a new one
        var unfinishedJob = await _context.ImportJobs
            .Where(j => j.JobType == JobType.HarvardDirect && j.ResourceNumber == hardvardResourceNumber && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted))
            .SingleOrDefaultAsync();
        if (unfinishedJob != null)
        {
            return new RServiceResult<bool>(false, $"Job is already scheduled or running for importing harvard direct resource number {hardvardResourceNumber}");
        }

        if (string.IsNullOrEmpty(friendlyUrl))
        {
            return new RServiceResult<bool>(false, $"Friendly url is empty, harvard direct resource number {hardvardResourceNumber}");
        }

        var artifactWithSameUrl = await _context.Artifacts
            .Where(a => a.FriendlyUrl == friendlyUrl)
            .SingleOrDefaultAsync();
        if (artifactWithSameUrl != null)
        {
            return new RServiceResult<bool>(false, $"duplicated friendly url '{friendlyUrl}'");
        }

        ImportJob job = new ImportJob()
        {
            JobType = JobType.HarvardDirect,
            ResourceNumber = hardvardResourceNumber,
            FriendlyUrl = friendlyUrl,
            SrcUrl = srcUrl,
            QueueTime = DateTime.Now,
            ProgressPercent = 0,
            Status = ImportJobStatus.NotStarted
        };
        await _context.ImportJobs.AddAsync(job);
        await _context.SaveChangesAsync();

        _backgroundTaskQueue.QueueBackgroundWorkItem
        (
            async token =>
            {
                // Stores the failed state of the job through a dedicated short-lived context.
                async Task MarkJobFailedAsync(string reason)
                {
                    using (var jobDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                    {
                        job.EndTime = DateTime.Now;
                        job.Status = ImportJobStatus.Failed;
                        job.Exception = reason;
                        jobDb.Update(job);
                        await jobDb.SaveChangesAsync();
                    }
                }

                try
                {
                    using (var context = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                    {
                        string extractionNote = $"extracted from harvard resource number {job.ResourceNumber}";
                        var book = new RArtifactMasterRecord(extractionNote, extractionNote)
                        {
                            Status = PublishStatus.Draft,
                            DateTime = DateTime.Now,
                            LastModified = DateTime.Now,
                            CoverItemIndex = 0,
                            FriendlyUrl = friendlyUrl
                        };

                        // fixed tags attached to every artifact imported from this source
                        var meta = new List<RTagValue>();
                        meta.Add(await TagHandler.PrepareAttribute(context, "Notes", "وارد شده از سایت دنیای زنان در عصر قاجار", 1));
                        meta.Add(await TagHandler.PrepareAttribute(context, "Type", "Book", 1));
                        meta.Add(await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1));

                        RTagValue sourceTag = await TagHandler.PrepareAttribute(context, "Source", "دنیای زنان در عصر قاجار", 1);
                        sourceTag.ValueSupplement = $"{job.SrcUrl}";
                        meta.Add(sourceTag);

                        using (var jobDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                        {
                            job.StartTime = DateTime.Now;
                            job.Status = ImportJobStatus.Running;
                            job.SrcContent = $"https://iiif.lib.harvard.edu/manifests/drs:{hardvardResourceNumber}";
                            jobDb.Update(job);
                            await jobDb.SaveChangesAsync();
                        }

                        var importResult = await _InternalHarvardJsonImport(hardvardResourceNumber, job, friendlyUrl, context, book, meta);
                        List<RArtifactItemRecord> pages = importResult.Result;
                        if (pages == null)
                        {
                            // the page import gave up; nothing more is recorded here
                            return;
                        }

                        book.Tags = meta.ToArray();
                        book.Items = pages.ToArray();
                        book.ItemCount = pages.Count;

                        if (pages.Count == 0)
                        {
                            await MarkJobFailedAsync("Pages.Count == 0");
                            return;
                        }

                        await context.Artifacts.AddAsync(book);
                        await context.SaveChangesAsync();

                        job.ProgressPercent = 100;
                        job.Status = ImportJobStatus.Succeeded;
                        job.ArtifactId = book.Id;
                        job.EndTime = DateTime.Now;
                        context.Update(job);
                        await context.SaveChangesAsync();
                    }
                }
                catch (Exception exp)
                {
                    await MarkJobFailedAsync(exp.ToString());
                }
            }
        );

        return new RServiceResult<bool>(true);
    }
    catch (Exception exp)
    {
        return new RServiceResult<bool>(false, exp.ToString());
    }
}
/// <summary> /// from http://www.bl.uk /// </summary> /// <param name="resourceNumber">grenville_xli_f001r</param> /// <param name="friendlyUrl"></param> /// <returns></returns> private async Task <RServiceResult <bool> > StartImportingFromBritishLibrary(string resourceNumber, string friendlyUrl) { string url = $"http://www.bl.uk/manuscripts/Viewer.aspx?ref={resourceNumber}"; if ( ( await _context.ImportJobs .Where(j => j.JobType == JobType.BritishLibrary && j.ResourceNumber == resourceNumber && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted)) .SingleOrDefaultAsync() ) != null ) { return(new RServiceResult <bool>(false, $"Job is already scheduled or running for importing {url}")); } if (string.IsNullOrEmpty(friendlyUrl)) { friendlyUrl = resourceNumber; } if ( (await _context.Artifacts.Where(a => a.FriendlyUrl == friendlyUrl).SingleOrDefaultAsync()) != null ) { return(new RServiceResult <bool>(false, $"duplicated artifact friendly url '{friendlyUrl}'")); } ImportJob job = new ImportJob() { JobType = JobType.BritishLibrary, ResourceNumber = resourceNumber, FriendlyUrl = friendlyUrl, SrcUrl = url, QueueTime = DateTime.Now, ProgressPercent = 0, Status = ImportJobStatus.NotStarted }; await _context.ImportJobs.AddAsync ( job ); await _context.SaveChangesAsync(); try { _backgroundTaskQueue.QueueBackgroundWorkItem ( async token => { try { using (var client = new HttpClient()) { client.Timeout = TimeSpan.FromMinutes(5); using (var result = await client.GetAsync(url)) { if (result.IsSuccessStatusCode) { using (RMuseumDbContext context = new RMuseumDbContext(Configuration)) { if ( (await context.Artifacts.Where(a => a.FriendlyUrl == job.FriendlyUrl).SingleOrDefaultAsync()) != null ) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = "aborted because of duplicated friendly url"; importJobUpdaterDb.Update(job); await 
importJobUpdaterDb.SaveChangesAsync(); } return; } RArtifactMasterRecord book = new RArtifactMasterRecord($"extracted from {url}", $"extracted from {url}") { Status = PublishStatus.Draft, DateTime = DateTime.Now, LastModified = DateTime.Now, CoverItemIndex = 0, FriendlyUrl = friendlyUrl }; List <RTagValue> meta = new List <RTagValue>(); RTagValue tag; string html = await result.Content.ReadAsStringAsync(); using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.StartTime = DateTime.Now; job.Status = ImportJobStatus.Running; job.SrcContent = html; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } int nIdxStart = html.IndexOf("PageList"); if (nIdxStart == -1) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = "PageList not found"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } nIdxStart = html.IndexOf("value=\"", nIdxStart); if (nIdxStart == -1) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = "value after PageList not found"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } nIdxStart += "value=\"".Length; string strPageList = html.Substring(nIdxStart, html.IndexOf('"', nIdxStart) - nIdxStart); nIdxStart = html.IndexOf("TextList"); if (nIdxStart == -1) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = "TextList not found"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } nIdxStart = html.IndexOf("value=\"", nIdxStart); if (nIdxStart == -1) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = 
DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = "value after TextList not found"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } nIdxStart += "value=\"".Length; string strTextList = html.Substring(nIdxStart, html.IndexOf('"', nIdxStart) - nIdxStart); nIdxStart = html.IndexOf("TitleList"); if (nIdxStart == -1) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = "TitleList not found"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } nIdxStart = html.IndexOf("value=\"", nIdxStart); if (nIdxStart == -1) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = "value after TitleList not found"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } nIdxStart += "value=\"".Length; string strTitleList = html.Substring(nIdxStart, html.IndexOf('"', nIdxStart) - nIdxStart); string[] PageUrls = strPageList.Split("||", StringSplitOptions.None); string[] PageTexts = strTextList.Split("||", StringSplitOptions.None); string[] PageTitles = strTitleList.Split("||", StringSplitOptions.None); if (PageUrls.Length != PageTexts.Length || PageTexts.Length != PageTitles.Length) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = "PageUrls.Length != PageTexts.Length || PageTexts.Length != PageTitles.Length"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } tag = await TagHandler.PrepareAttribute(context, "Title", "Untitled", 1); meta.Add(tag); tag = await TagHandler.PrepareAttribute(context, "Contributor Names", "Unknown", 1); meta.Add(tag); tag = await 
TagHandler.PrepareAttribute(context, "Type", "Book", 1); meta.Add(tag); tag = await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1); meta.Add(tag); book.Tags = meta.ToArray(); tag = await TagHandler.PrepareAttribute(context, "Source", "British Library", 1); string viewerUrl = $"http://www.bl.uk/manuscripts/FullDisplay.aspx?ref={resourceNumber.Substring(0, resourceNumber.LastIndexOf('_'))}"; tag.ValueSupplement = viewerUrl; meta.Add(tag); List <RArtifactItemRecord> pages = new List <RArtifactItemRecord>(); int order = 0; for (int i = 0; i < PageUrls.Length; i++) { if (PageUrls[i] == "##") { continue; } order++; using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.ProgressPercent = order; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } RArtifactItemRecord page = new RArtifactItemRecord() { Name = $"تصویر {order}", NameInEnglish = $"Image {order}", Description = "", DescriptionInEnglish = "", Order = order, FriendlyUrl = $"p{$"{order}".PadLeft(4, '0')}", LastModified = DateTime.Now }; List <RTagValue> pageTags = new List <RTagValue>(); tag = await TagHandler.PrepareAttribute(context, "Source", "British Library", 1); tag.ValueSupplement = $"http://www.bl.uk/manuscripts/Viewer.aspx?ref={PageUrls[i]}"; pageTags.Add(tag); if (!string.IsNullOrEmpty(PageTitles[i])) { RTagValue toc = await TagHandler.PrepareAttribute(context, "Title in TOC", PageTitles[i], 1); toc.ValueSupplement = "1"; //font size pageTags.Add(toc); } if (!string.IsNullOrEmpty(PageTexts[i])) { tag = await TagHandler.PrepareAttribute(context, "Label", PageTexts[i], 1); pageTags.Add(tag); } page.Tags = pageTags.ToArray(); bool recovered = false; if ( File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "orig"), $"{order}".PadLeft(4, '0') + ".jpg") ) && File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "norm"), 
$"{order}".PadLeft(4, '0') + ".jpg") ) && File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "thumb"), $"{order}".PadLeft(4, '0') + ".jpg") ) ) { RServiceResult <RPictureFile> picture = await _pictureFileService.RecoverFromeFiles(page.Name, page.Description, 1, viewerUrl, Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "orig"), $"{order}".PadLeft(4, '0') + ".jpg"), Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "norm"), $"{order}".PadLeft(4, '0') + ".jpg"), Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "thumb"), $"{order}".PadLeft(4, '0') + ".jpg"), $"{order}".PadLeft(4, '0') + ".jpg", friendlyUrl); if (picture.Result != null) { recovered = true; page.Images = new RPictureFile[] { picture.Result }; page.CoverImageIndex = 0; if (book.CoverItemIndex == (order - 1)) { book.CoverImage = RPictureFile.Duplicate(picture.Result); } pages.Add(page); } } if (!recovered) { if ( File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "orig"), $"{order}".PadLeft(4, '0') + ".jpg") ) ) { File.Delete ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "orig"), $"{order}".PadLeft(4, '0') + ".jpg") ); } if ( File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "norm"), $"{order}".PadLeft(4, '0') + ".jpg") ) ) { File.Delete ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "norm"), $"{order}".PadLeft(4, '0') + ".jpg") ); } if ( File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "thumb"), $"{order}".PadLeft(4, '0') + ".jpg") ) ) { File.Delete ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "thumb"), $"{order}".PadLeft(4, 
'0') + ".jpg") ); } /* * failed multithread attempt: * * BLTileMixer mixer = new BLTileMixer(); * RServiceResult<Stream> blResult = await mixer.DownloadMix(PageUrls[i], order); */ Dictionary <(int x, int y), Image> tiles = new Dictionary <(int x, int y), Image>(); int max_x = -1; for (int x = 0; ; x++) { string imageUrl = $"http://www.bl.uk/manuscripts/Proxy.ashx?view={PageUrls[i]}_files/13/{x}_0.jpg"; var imageResult = await client.GetAsync(imageUrl); int _ImportRetryCount = 5; int _ImportRetryInitialSleep = 500; int retryCount = 0; while (retryCount < _ImportRetryCount && !imageResult.IsSuccessStatusCode && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable) { imageResult.Dispose(); Thread.Sleep(_ImportRetryInitialSleep * (retryCount + 1)); imageResult = await client.GetAsync(imageUrl); retryCount++; } if (imageResult.IsSuccessStatusCode) { using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync()) { imageStream.Position = 0; try { Image tile = Image.FromStream(imageStream); tiles.Add((x, 0), tile); max_x = x; } catch (Exception aexp) { if (aexp is ArgumentException) { break; } throw aexp; } } } else { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = $"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } imageResult.Dispose(); return; } } int max_y = -1; for (int y = 1; ; y++) { string imageUrl = $"http://www.bl.uk/manuscripts/Proxy.ashx?view={PageUrls[i]}_files/13/0_{y}.jpg"; var imageResult = await client.GetAsync(imageUrl); int _ImportRetryCount = 5; int _ImportRetryInitialSleep = 500; int retryCount = 0; while (retryCount < _ImportRetryCount && !imageResult.IsSuccessStatusCode && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable) { imageResult.Dispose(); Thread.Sleep(_ImportRetryInitialSleep * 
(retryCount + 1)); imageResult = await client.GetAsync(imageUrl); retryCount++; } if (imageResult.IsSuccessStatusCode) { using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync()) { if (imageStream.Length <= 248) { break; } imageStream.Position = 0; try { Image tile = Image.FromStream(imageStream); tiles.Add((0, y), tile); max_y = y; } catch (Exception aexp) { if (aexp is ArgumentException) { break; } throw aexp; } } } else { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = $"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } imageResult.Dispose(); return; } } for (int x = 0; x <= max_x; x++) { for (int y = 0; y <= max_y; y++) { if (tiles.TryGetValue((x, y), out Image tmp) == false) { string imageUrl = $"http://www.bl.uk/manuscripts/Proxy.ashx?view={PageUrls[i]}_files/13/{x}_{y}.jpg"; var imageResult = await client.GetAsync(imageUrl); int _ImportRetryCount = 5; int _ImportRetryInitialSleep = 500; int retryCount = 0; while (retryCount < _ImportRetryCount && !imageResult.IsSuccessStatusCode && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable) { imageResult.Dispose(); Thread.Sleep(_ImportRetryInitialSleep * (retryCount + 1)); imageResult = await client.GetAsync(imageUrl); retryCount++; } if (imageResult.IsSuccessStatusCode) { using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync()) { if (imageStream.Length == 0) { break; } imageStream.Position = 0; tiles.Add((x, y), Image.FromStream(imageStream)); } } else { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = $"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}"; importJobUpdaterDb.Update(job); await 
importJobUpdaterDb.SaveChangesAsync(); } imageResult.Dispose(); return; } } } } int tileWidth = tiles[(0, 0)].Width; int tileHeight = tiles[(0, 0)].Height;
/// <summary>
/// Schedules a background import of a manuscript from http://pudl.princeton.edu/.
/// The queued work item downloads the object's XML description, converts its
/// properties to tags, downloads (or recovers from disk) one image per page and
/// finally stores the resulting artifact. Progress and failures are recorded on
/// the <see cref="ImportJob"/> row created by this method.
/// </summary>
/// <param name="resourceNumber">Princeton object id, e.g. dj52w476m</param>
/// <param name="friendlyUrl">friendly url of the new artifact; when null or empty <paramref name="resourceNumber"/> is used</param>
/// <returns>
/// true when the job has been queued; false (with a message) when a non-failed job for the same
/// resource already exists, the friendly url is already taken or queueing itself throws
/// </returns>
private async Task<RServiceResult<bool>> StartImportingFromPrinceton(string resourceNumber, string friendlyUrl)
{
    string url = $"http://pudl.princeton.edu/mdCompiler2.php?obj={resourceNumber}";

    // AnyAsync rather than SingleOrDefaultAsync: more than one matching row (for instance
    // several succeeded jobs) must be reported as "already exists" instead of throwing.
    bool jobExists = await _context.ImportJobs.AnyAsync(
        j => j.JobType == JobType.Princeton
             && j.ResourceNumber == resourceNumber
             && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted));
    if (jobExists)
    {
        return new RServiceResult<bool>(false, $"Job is already scheduled or running for importing {url}");
    }

    if (string.IsNullOrEmpty(friendlyUrl))
    {
        friendlyUrl = resourceNumber;
    }

    if (await _context.Artifacts.AnyAsync(a => a.FriendlyUrl == friendlyUrl))
    {
        return new RServiceResult<bool>(false, $"duplicated friendly url '{friendlyUrl}'");
    }

    ImportJob job = new ImportJob()
    {
        JobType = JobType.Princeton,
        ResourceNumber = resourceNumber,
        FriendlyUrl = friendlyUrl,
        SrcUrl = url,
        QueueTime = DateTime.Now,
        ProgressPercent = 0,
        Status = ImportJobStatus.NotStarted
    };

    await _context.ImportJobs.AddAsync(job);
    await _context.SaveChangesAsync();

    // Marks the job as failed and persists it through a short-lived context of its own,
    // exactly as every failure path of this importer does.
    // NOTE(review): presumably a separate context is used so that job updates are not saved
    // together with whatever the main import context is tracking - confirm.
    async Task FailJobAsync(string reason)
    {
        using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
        {
            job.EndTime = DateTime.Now;
            job.Status = ImportJobStatus.Failed;
            job.Exception = reason;
            importJobUpdaterDb.Update(job);
            await importJobUpdaterDb.SaveChangesAsync();
        }
    }

    try
    {
        _backgroundTaskQueue.QueueBackgroundWorkItem(
            async token =>
            {
                try
                {
                    using (var client = new HttpClient())
                    using (var result = await client.GetAsync(url))
                    {
                        if (!result.IsSuccessStatusCode)
                        {
                            await FailJobAsync($"Http result is not ok ({result.StatusCode}) for {url}");
                            return;
                        }

                        using (RMuseumDbContext context = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                        {
                            // The job may have waited in the queue: re-check the friendly url the same
                            // way the other importers of this file do before doing any heavy work.
                            if (await context.Artifacts.AnyAsync(a => a.FriendlyUrl == job.FriendlyUrl))
                            {
                                await FailJobAsync("aborted because of duplicated friendly url");
                                return;
                            }

                            RArtifactMasterRecord book = new RArtifactMasterRecord($"extracted from {url}", $"extracted from {url}")
                            {
                                Status = PublishStatus.Draft,
                                DateTime = DateTime.Now,
                                LastModified = DateTime.Now,
                                CoverItemIndex = 0,
                                FriendlyUrl = friendlyUrl
                            };

                            List<RTagValue> meta = new List<RTagValue>();
                            RTagValue tag;

                            string xml = await result.Content.ReadAsStringAsync();

                            // keep the raw source with the job and flag it as running
                            using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                            {
                                job.StartTime = DateTime.Now;
                                job.Status = ImportJobStatus.Running;
                                job.SrcContent = xml;
                                importJobUpdaterDb.Update(job);
                                await importJobUpdaterDb.SaveChangesAsync();
                            }

                            XElement elObject = XDocument.Parse(xml).Root;

                            // dmd/properties/property: every labelled property becomes one tag per value
                            foreach (var prop in elObject.Element("dmd").Element("properties").Elements("property"))
                            {
                                if (prop.Element("label") == null)
                                {
                                    continue;
                                }

                                string label = prop.Element("label").Value.Replace(":", "");
                                int valueOrder = 1;
                                foreach (var value in prop.Elements("valueGrp").Elements("value"))
                                {
                                    tag = await TagHandler.PrepareAttribute(context, label, value.Value, valueOrder);

                                    // links pointing at http://localhost are not kept as supplements
                                    XAttribute href = value.Attribute("href");
                                    if (href != null && !href.Value.StartsWith("http://localhost", StringComparison.Ordinal))
                                    {
                                        tag.ValueSupplement = href.Value;
                                    }

                                    meta.Add(tag);

                                    if (label == "Title")
                                    {
                                        book.Name = book.NameInEnglish = book.Description = book.DescriptionInEnglish = value.Value;
                                    }

                                    valueOrder++;
                                }
                            }

                            tag = await TagHandler.PrepareAttribute(context, "Type", "Book", 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1);
                            meta.Add(tag);

                            tag = await TagHandler.PrepareAttribute(context, "Source", "Princeton Digital Library of Islamic Manuscripts", 1);
                            tag.ValueSupplement = $"http://pudl.princeton.edu/objects/{job.ResourceNumber}";
                            meta.Add(tag);

                            book.Tags = meta.ToArray();

                            List<RArtifactItemRecord> pages = new List<RArtifactItemRecord>();

                            // retry policy for image downloads answered with 503
                            const int importRetryCount = 5;
                            const int importRetryInitialSleep = 500;

                            // structure[type=RelatedObjects]/div/OrderedList/div: one entry per page image
                            foreach (var structure in elObject.Elements("structure"))
                            {
                                if (structure.Attribute("type")?.Value != "RelatedObjects")
                                {
                                    continue;
                                }

                                XElement orderedList = structure.Element("div")?.Element("OrderedList");
                                if (orderedList == null)
                                {
                                    await FailJobAsync("structure[RelatedObjects].div.OrderedList is null");
                                    return;
                                }

                                int pageCount = orderedList.Elements("div").Count();
                                int inlineOrder = 0;
                                foreach (var div in orderedList.Elements("div"))
                                {
                                    inlineOrder++;
                                    using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                                    {
                                        job.ProgressPercent = inlineOrder * 100 / (decimal)pageCount;
                                        importJobUpdaterDb.Update(job);
                                        await importJobUpdaterDb.SaveChangesAsync();
                                    }

                                    int order = int.Parse(div.Attribute("order").Value);
                                    string paddedOrder = $"{order}".PadLeft(4, '0');
                                    string fileName = paddedOrder + ".jpg";

                                    RArtifactItemRecord page = new RArtifactItemRecord()
                                    {
                                        Name = $"تصویر {order}",
                                        NameInEnglish = div.Attribute("label").Value,
                                        Description = "",
                                        DescriptionInEnglish = "",
                                        Order = order,
                                        FriendlyUrl = $"p{paddedOrder}",
                                        LastModified = DateTime.Now
                                    };

                                    // the image url is built from the part of @img after its last ':' and the @h height
                                    string imageUrl = div.Attribute("img").Value;
                                    imageUrl = "https://libimages.princeton.edu/loris/" + imageUrl.Substring(imageUrl.LastIndexOf(":") + 1);
                                    imageUrl += $"/full/,{div.Attribute("h").Value}/0/default.jpg";

                                    tag = await TagHandler.PrepareAttribute(context, "Source", "Princeton Digital Library of Islamic Manuscripts", 1);
                                    tag.ValueSupplement = imageUrl;
                                    page.Tags = new RTagValue[] { tag };

                                    // imageUrl always starts with a constant prefix, so it can never be empty here
                                    string artifactFolder = Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl);
                                    string origPath = Path.Combine(Path.Combine(artifactFolder, "orig"), fileName);
                                    string normPath = Path.Combine(Path.Combine(artifactFolder, "norm"), fileName);
                                    string thumbPath = Path.Combine(Path.Combine(artifactFolder, "thumb"), fileName);

                                    // when all three renditions are already on disk, try to reuse them instead of downloading
                                    bool recovered = false;
                                    if (File.Exists(origPath) && File.Exists(normPath) && File.Exists(thumbPath))
                                    {
                                        RServiceResult<RPictureFile> picture =
                                            await _pictureFileService.RecoverFromeFiles(
                                                page.Name, page.Description, 1, imageUrl,
                                                origPath, normPath, thumbPath,
                                                fileName, friendlyUrl);
                                        if (picture.Result != null)
                                        {
                                            recovered = true;
                                            page.Images = new RPictureFile[] { picture.Result };
                                            page.CoverImageIndex = 0;

                                            if (book.CoverItemIndex == (order - 1))
                                            {
                                                book.CoverImage = RPictureFile.Duplicate(picture.Result);
                                            }

                                            pages.Add(page);
                                        }
                                    }

                                    if (recovered)
                                    {
                                        continue;
                                    }

                                    // remove whatever renditions exist for this page before downloading it again
                                    foreach (string stalePath in new[] { origPath, normPath, thumbPath })
                                    {
                                        if (File.Exists(stalePath))
                                        {
                                            File.Delete(stalePath);
                                        }
                                    }

                                    HttpResponseMessage imageResult = await client.GetAsync(imageUrl);
                                    try
                                    {
                                        // retry on 503 with a linearly growing delay; Task.Delay keeps the
                                        // worker thread free while waiting
                                        int retryCount = 0;
                                        while (retryCount < importRetryCount && !imageResult.IsSuccessStatusCode && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable)
                                        {
                                            imageResult.Dispose();
                                            await Task.Delay(importRetryInitialSleep * (retryCount + 1));
                                            imageResult = await client.GetAsync(imageUrl);
                                            retryCount++;
                                        }

                                        if (!imageResult.IsSuccessStatusCode)
                                        {
                                            await FailJobAsync($"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}");
                                            return;
                                        }

                                        using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync())
                                        {
                                            RServiceResult<RPictureFile> picture =
                                                await _pictureFileService.Add(page.Name, page.Description, 1, null, imageUrl, imageStream, fileName, friendlyUrl);
                                            if (picture.Result == null)
                                            {
                                                // caught by the outer handler, which stores it on the job
                                                throw new Exception($"_pictureFileService.Add : {picture.ExceptionString}");
                                            }

                                            page.Images = new RPictureFile[] { picture.Result };
                                            page.CoverImageIndex = 0;

                                            if (book.CoverItemIndex == (order - 1))
                                            {
                                                book.CoverImage = RPictureFile.Duplicate(picture.Result);
                                            }

                                            pages.Add(page);
                                        }
                                    }
                                    finally
                                    {
                                        // also releases the response when saving the picture throws
                                        imageResult.Dispose();
                                    }

                                    // kept from the original implementation
                                    // NOTE(review): presumably meant to release image buffers between pages - confirm it is still needed
                                    GC.Collect();
                                }
                            }

                            // structure[type=Physical]/RTLBoundManuscript: attach leaf / folio side labels to the pages found above
                            foreach (var structure in elObject.Elements("structure"))
                            {
                                if (structure.Attribute("type")?.Value != "Physical")
                                {
                                    continue;
                                }

                                XElement manuscript = structure.Element("RTLBoundManuscript");
                                if (manuscript == null)
                                {
                                    continue;
                                }

                                foreach (var leaf in manuscript.Elements("Leaf"))
                                {
                                    foreach (var side in leaf.Elements("Side"))
                                    {
                                        int pageOrder = int.Parse(side.Attribute("order").Value);
                                        tag = await TagHandler.PrepareAttribute(context, "Leaf Side", side.Attribute("label").Value, 100);
                                        RArtifactItemRecord page = pages.SingleOrDefault(p => p.Order == pageOrder);
                                        if (page != null)
                                        {
                                            List<RTagValue> tags = new List<RTagValue>(page.Tags);
                                            tags.Add(tag);
                                            page.Tags = tags;
                                        }
                                    }
                                }

                                foreach (var folio in manuscript.Elements("Folio"))
                                {
                                    foreach (var side in folio.Elements("Side"))
                                    {
                                        int pageOrder = int.Parse(side.Attribute("order").Value);
                                        tag = await TagHandler.PrepareAttribute(context, "Folio Side", folio.Attribute("label").Value + ":" + side.Attribute("label").Value, 101);
                                        RArtifactItemRecord page = pages.SingleOrDefault(p => p.Order == pageOrder);
                                        if (page != null)
                                        {
                                            List<RTagValue> tags = new List<RTagValue>(page.Tags);
                                            tags.Add(tag);
                                            page.Tags = tags;
                                        }
                                    }
                                }
                            }

                            book.Items = pages.ToArray();
                            book.ItemCount = pages.Count;

                            if (pages.Count == 0)
                            {
                                await FailJobAsync("Pages.Count == 0");
                                return;
                            }

                            await context.Artifacts.AddAsync(book);
                            await context.SaveChangesAsync();

                            job.ProgressPercent = 100;
                            job.Status = ImportJobStatus.Succeeded;
                            job.ArtifactId = book.Id;
                            job.EndTime = DateTime.Now;
                            context.Update(job);
                            await context.SaveChangesAsync();
                        }
                    }
                }
                catch (Exception exp)
                {
                    // any unexpected error ends the job as failed with the full exception text
                    await FailJobAsync(exp.ToString());
                }
            });

        return new RServiceResult<bool>(true);
    }
    catch (Exception exp)
    {
        return new RServiceResult<bool>(false, exp.ToString());
    }
}
/// <summary> /// from http://www.library.upenn.edu/ /// </summary> /// <param name="resourceNumber">MEDREN_9949222153503681</param> /// <param name="friendlyUrl"></param> /// <returns></returns> private async Task <RServiceResult <bool> > StartImportingFromPenLibraries(string resourceNumber, string friendlyUrl) { string url = $"http://dla.library.upenn.edu/dla/medren/pageturn.html?id={resourceNumber}&rotation=0&size=0"; if ( ( await _context.ImportJobs .Where(j => j.JobType == JobType.PennLibraries && j.ResourceNumber == resourceNumber && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted)) .SingleOrDefaultAsync() ) != null ) { return(new RServiceResult <bool>(false, $"Job is already scheduled or running for importing {url}")); } if (string.IsNullOrEmpty(friendlyUrl)) { friendlyUrl = resourceNumber; } if ( (await _context.Artifacts.Where(a => a.FriendlyUrl == friendlyUrl).SingleOrDefaultAsync()) != null ) { return(new RServiceResult <bool>(false, $"duplicated artifact friendly url '{friendlyUrl}'")); } ImportJob job = new ImportJob() { JobType = JobType.PennLibraries, ResourceNumber = resourceNumber, FriendlyUrl = friendlyUrl, SrcUrl = url, QueueTime = DateTime.Now, ProgressPercent = 0, Status = ImportJobStatus.NotStarted }; await _context.ImportJobs.AddAsync ( job ); await _context.SaveChangesAsync(); try { _backgroundTaskQueue.QueueBackgroundWorkItem ( async token => { try { using (var client = new HttpClient()) { client.Timeout = TimeSpan.FromMinutes(5); using (var result = await client.GetAsync(url)) { if (result.IsSuccessStatusCode) { using (RMuseumDbContext context = new RMuseumDbContext(Configuration)) { if ( (await context.Artifacts.Where(a => a.FriendlyUrl == job.FriendlyUrl).SingleOrDefaultAsync()) != null ) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = "aborted because of duplicated friendly url"; 
importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } RArtifactMasterRecord book = new RArtifactMasterRecord($"extracted from {url}", $"extracted from {url}") { Status = PublishStatus.Draft, DateTime = DateTime.Now, LastModified = DateTime.Now, CoverItemIndex = 0, FriendlyUrl = friendlyUrl }; List <RTagValue> meta = new List <RTagValue>(); RTagValue tag; string html = await result.Content.ReadAsStringAsync(); using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.StartTime = DateTime.Now; job.Status = ImportJobStatus.Running; job.SrcContent = html; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } string title = ""; string author = ""; int tagOrder = 1; int nIdxStart = html.IndexOf("https://repo.library.upenn.edu/djatoka/resolver?"); if (nIdxStart == -1) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = "https://repo.library.upenn.edu/djatoka/resolver? 
not found"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } string firstImageUrl = html.Substring(nIdxStart, html.IndexOf('"', nIdxStart) - nIdxStart).Replace("&", "&"); nIdxStart = html.IndexOf("recordinfolabel"); while (nIdxStart != -1) { nIdxStart += "recordinfolabel\">".Length; int nIdxEnd = html.IndexOf(":", nIdxStart); string recordinfolabel = html.Substring(nIdxStart, nIdxEnd - nIdxStart); nIdxStart = html.IndexOf("recordinfotext", nIdxEnd); nIdxStart += "recordinfotext\">".Length; nIdxEnd = html.IndexOf("</td>", nIdxStart); string recordinfotext = html.Substring(nIdxStart, nIdxEnd - nIdxStart).Replace("</div>", "<div>").Replace("\n", "").Replace("\r", "").Trim(); string[] values = recordinfotext.Split("<div>", StringSplitOptions.RemoveEmptyEntries); foreach (string value in values) { if (value.Trim().Length == 0) { continue; } if (recordinfolabel == "Title") { title = value.Trim(); tag = await TagHandler.PrepareAttribute(context, "Title", title, 1); meta.Add(tag); } else if (recordinfolabel == "Author") { author = value.Trim(); tag = await TagHandler.PrepareAttribute(context, "Contributor Names", author, 1); meta.Add(tag); } else { tag = await TagHandler.PrepareAttribute(context, recordinfolabel, value.Trim(), tagOrder++); meta.Add(tag); } } nIdxStart = html.IndexOf("recordinfolabel", nIdxEnd); } tag = await TagHandler.PrepareAttribute(context, "Type", "Book", 1); meta.Add(tag); tag = await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1); meta.Add(tag); tag = await TagHandler.PrepareAttribute(context, "Source", "Penn Libraries", 1); string viewerUrl = $"http://dla.library.upenn.edu/dla/medren/detail.html?id={resourceNumber}"; tag.ValueSupplement = viewerUrl; meta.Add(tag); book.Name = book.NameInEnglish = book.Description = book.DescriptionInEnglish = title; book.Tags = meta.ToArray(); List <RArtifactItemRecord> pages = new List <RArtifactItemRecord>(); int order = 0; while (true) { order++; using 
(RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.ProgressPercent = order; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } string imageUrl = firstImageUrl; RArtifactItemRecord page = new RArtifactItemRecord() { Name = $"تصویر {order}", NameInEnglish = $"Image {order}", Description = "", DescriptionInEnglish = "", Order = order, FriendlyUrl = $"p{$"{order}".PadLeft(4, '0')}", LastModified = DateTime.Now }; tag = await TagHandler.PrepareAttribute(context, "Source", "Penn Libraries", 1); tag.ValueSupplement = viewerUrl; page.Tags = new RTagValue[] { tag }; if (!string.IsNullOrEmpty(imageUrl)) { bool recovered = false; if ( File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "orig"), $"{order}".PadLeft(4, '0') + ".jpg") ) && File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "norm"), $"{order}".PadLeft(4, '0') + ".jpg") ) && File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "thumb"), $"{order}".PadLeft(4, '0') + ".jpg") ) ) { RServiceResult <RPictureFile> picture = await _pictureFileService.RecoverFromeFiles(page.Name, page.Description, 1, imageUrl, Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "orig"), $"{order}".PadLeft(4, '0') + ".jpg"), Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "norm"), $"{order}".PadLeft(4, '0') + ".jpg"), Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "thumb"), $"{order}".PadLeft(4, '0') + ".jpg"), $"{order}".PadLeft(4, '0') + ".jpg", friendlyUrl); if (picture.Result != null) { recovered = true; page.Images = new RPictureFile[] { picture.Result }; page.CoverImageIndex = 0; if (book.CoverItemIndex == (order - 1)) { book.CoverImage = RPictureFile.Duplicate(picture.Result); } pages.Add(page); 
} } if (!recovered) { if ( File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "orig"), $"{order}".PadLeft(4, '0') + ".jpg") ) ) { File.Delete ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "orig"), $"{order}".PadLeft(4, '0') + ".jpg") ); } if ( File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "norm"), $"{order}".PadLeft(4, '0') + ".jpg") ) ) { File.Delete ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "norm"), $"{order}".PadLeft(4, '0') + ".jpg") ); } if ( File.Exists ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "thumb"), $"{order}".PadLeft(4, '0') + ".jpg") ) ) { File.Delete ( Path.Combine(Path.Combine(Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl), "thumb"), $"{order}".PadLeft(4, '0') + ".jpg") ); } if (order > 1) { string pageUrl = $"http://dla.library.upenn.edu/dla/medren/pageturn.html?id={resourceNumber}&doubleside=0&rotation=0&size=0¤tpage={order}"; var pageResult = await client.GetAsync(pageUrl); if (pageResult.StatusCode == HttpStatusCode.NotFound) { break; //finished } string pageHtml = await pageResult.Content.ReadAsStringAsync(); nIdxStart = pageHtml.IndexOf("https://repo.library.upenn.edu/djatoka/resolver?"); if (nIdxStart == -1) { if (order > 1) { break; //finished } using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = $"https://repo.library.upenn.edu/djatoka/resolver? 
not found on page {order}"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } imageUrl = pageHtml.Substring(nIdxStart, pageHtml.IndexOf('"', nIdxStart) - nIdxStart).Replace("&", "&"); } var imageResult = await client.GetAsync(imageUrl); if (imageResult.StatusCode == HttpStatusCode.NotFound) { break; //finished } int _ImportRetryCount = 200; int _ImportRetryInitialSleep = 500; int retryCount = 0; while (retryCount < _ImportRetryCount && !imageResult.IsSuccessStatusCode && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable) { imageResult.Dispose(); Thread.Sleep(_ImportRetryInitialSleep * (retryCount + 1)); imageResult = await client.GetAsync(imageUrl); retryCount++; } if (imageResult.IsSuccessStatusCode) { using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync()) { RServiceResult <RPictureFile> picture = await _pictureFileService.Add(page.Name, page.Description, 1, null, imageUrl, imageStream, $"{order}".PadLeft(4, '0') + ".jpg", friendlyUrl); if (picture.Result == null) { throw new Exception($"_pictureFileService.Add : {picture.ExceptionString}"); } page.Images = new RPictureFile[] { picture.Result }; page.CoverImageIndex = 0; if (book.CoverItemIndex == (order - 1)) { book.CoverImage = RPictureFile.Duplicate(picture.Result); } pages.Add(page); } } else { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = $"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } imageResult.Dispose(); return; } imageResult.Dispose(); GC.Collect(); } } } book.Items = pages.ToArray(); book.ItemCount = pages.Count; if (pages.Count == 0) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = 
"Pages.Count == 0"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } await context.Artifacts.AddAsync(book); await context.SaveChangesAsync(); job.ProgressPercent = 100; job.Status = ImportJobStatus.Succeeded; job.ArtifactId = book.Id; job.EndTime = DateTime.Now; context.Update(job); await context.SaveChangesAsync(); } } else { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = $"Http result is not ok ({result.StatusCode}) for {url}"; importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } return; } } } } catch (Exception exp) { using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(Configuration)) { job.EndTime = DateTime.Now; job.Status = ImportJobStatus.Failed; job.Exception = exp.ToString(); importJobUpdaterDb.Update(job); await importJobUpdaterDb.SaveChangesAsync(); } } } ); return(new RServiceResult <bool>(true)); } catch (Exception exp) { return(new RServiceResult <bool>(false, exp.ToString())); } }
/// <summary>
/// Queues a background job that imports a book from the Library of Congress
/// (https://www.loc.gov) and stores it as a draft artifact.
/// </summary>
/// <remarks>
/// The method itself only validates the request, persists an <see cref="ImportJob"/> and
/// queues the work item. The queued work item downloads the gallery json, builds the
/// artifact metadata, then downloads (or recovers from previously stored files) one image
/// per page. Any failure inside the work item is recorded on the job
/// (<c>Status = Failed</c>, <c>Exception</c> filled) instead of being thrown.
/// </remarks>
/// <param name="resourceNumber">LoC resource number, for example <c>m084</c>.</param>
/// <param name="friendlyUrl">
/// Friendly url of the new artifact, for example <c>boostan1207</c>.
/// When null or empty, <paramref name="resourceNumber"/> is used instead.
/// </param>
/// <param name="resourcePrefix">LoC resource prefix, for example <c>plmp</c>.</param>
/// <returns>
/// A successful result once the job has been queued; a failed result when a non-failed,
/// non-aborted job already exists for the same resource number, when the friendly url is
/// already used by an artifact, or when queuing the work item throws.
/// </returns>
private async Task<RServiceResult<bool>> StartImportingFromTheLibraryOfCongress(string resourceNumber, string friendlyUrl, string resourcePrefix)
{
    string url = $"https://www.loc.gov/resource/{resourcePrefix}.{resourceNumber}/?fo=json&st=gallery";

    // Existence check only: AnyAsync does not throw when more than one row matches.
    bool jobAlreadyActive = await _context.ImportJobs
        .Where(j => j.JobType == JobType.Loc && j.ResourceNumber == resourceNumber && !(j.Status == ImportJobStatus.Failed || j.Status == ImportJobStatus.Aborted))
        .AnyAsync();
    if (jobAlreadyActive)
    {
        return new RServiceResult<bool>(false, $"Job is already scheduled or running for importing {url}");
    }

    if (string.IsNullOrEmpty(friendlyUrl))
    {
        friendlyUrl = resourceNumber;
    }

    if (await _context.Artifacts.Where(a => a.FriendlyUrl == friendlyUrl).AnyAsync())
    {
        return new RServiceResult<bool>(false, $"duplicated friendly url '{friendlyUrl}'");
    }

    ImportJob job = new ImportJob()
    {
        JobType = JobType.Loc,
        ResourceNumber = resourceNumber,
        FriendlyUrl = friendlyUrl,
        SrcUrl = url,
        QueueTime = DateTime.Now,
        ProgressPercent = 0,
        Status = ImportJobStatus.NotStarted
    };
    await _context.ImportJobs.AddAsync(job);
    await _context.SaveChangesAsync();

    try
    {
        _backgroundTaskQueue.QueueBackgroundWorkItem
        (
            async token =>
            {
                // Records a terminal failure on the job through a short-lived context.
                async Task MarkJobFailedAsync(string reason)
                {
                    using (RMuseumDbContext jobDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                    {
                        job.EndTime = DateTime.Now;
                        job.Status = ImportJobStatus.Failed;
                        job.Exception = reason;
                        jobDb.Update(job);
                        await jobDb.SaveChangesAsync();
                    }
                }

                try
                {
                    using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                    {
                        job.StartTime = DateTime.Now;
                        job.Status = ImportJobStatus.Running;
                        importJobUpdaterDb.Update(job);
                        await importJobUpdaterDb.SaveChangesAsync();
                    }

                    // One client is shared by every request of this job.
                    using (var client = new HttpClient())
                    {
                        int pageCount = 0;
                        int representativeIndex = 0;

                        // First read a single page only to find out how many pages there are.
                        using (var summaryResult = await client.GetAsync(url))
                        {
                            if (!summaryResult.IsSuccessStatusCode)
                            {
                                await MarkJobFailedAsync($"Http result is not ok ({summaryResult.StatusCode}) for {url}");
                                return;
                            }

                            var summary = JObject.Parse(await summaryResult.Content.ReadAsStringAsync());
                            pageCount = summary.SelectToken("resource.segment_count").Value<int>();
                            representativeIndex = summary.SelectToken("resource.representative_index").Value<int>();
                        }

                        // Known problem: the loc json does not return a correct answer when the
                        // number of segments is more than 1000 (see the mismatch check below).

                        // Now that the page count is known, read again asking for all of them.
                        string fullUrl = $"https://www.loc.gov/resource/{resourcePrefix}.{resourceNumber}/?c={pageCount}&fo=json&st=gallery";
                        using (var result = await client.GetAsync(fullUrl))
                        {
                            if (!result.IsSuccessStatusCode)
                            {
                                await MarkJobFailedAsync($"Http result is not ok ({result.StatusCode}) for {fullUrl}");
                                return;
                            }

                            // This code runs from a background service where _context may already
                            // be disposed, so a fresh context is created here.
                            using (RMuseumDbContext context = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                            {
                                RArtifactMasterRecord book = new RArtifactMasterRecord($"extracted from {fullUrl}", $"extracted from {fullUrl}")
                                {
                                    Status = PublishStatus.Draft,
                                    DateTime = DateTime.Now,
                                    LastModified = DateTime.Now,
                                    CoverItemIndex = representativeIndex,
                                    FriendlyUrl = friendlyUrl
                                };

                                string json = await result.Content.ReadAsStringAsync();
                                job.SrcContent = json;
                                var parsed = JObject.Parse(json);
                                var segmentsArray = parsed.SelectToken("segments").ToArray();

                                // Log the inconsistency caused by the loc >1000 segments problem;
                                // the import still goes on using pageCount.
                                if (segmentsArray.Length != pageCount)
                                {
                                    job.Exception = $"Page count ({pageCount}) is not equal to number of returned resources ({segmentsArray.Length}).";
                                }

                                List<RTagValue> meta = new List<RTagValue>();

                                string string_value = await HandleSimpleValue(context, parsed, meta, "item.title", "Title");
                                if (!string.IsNullOrWhiteSpace(string_value))
                                {
                                    book.Name = string_value;
                                    book.NameInEnglish = string_value;
                                }

                                await HandleSimpleValue(context, parsed, meta, "item.date", "Date");

                                string_value = await HandleListValue(context, parsed, meta, "item.other_title", "Other Title");
                                if (!string.IsNullOrWhiteSpace(string_value))
                                {
                                    book.Name = string_value;
                                }

                                await HandleListValue(context, parsed, meta, "item.contributor_names", "Contributor Names");
                                await HandleSimpleValue(context, parsed, meta, "item.shelf_id", "Shelf ID");
                                await HandleListValue(context, parsed, meta, "item.created_published", "Created / Published");
                                await HandleListValue(context, parsed, meta, "item.subject_headings", "Subject Headings");
                                await HandleListValue(context, parsed, meta, "item.notes", "Notes");
                                await HandleListValue(context, parsed, meta, "item.medium", "Medium");
                                await HandleListValue(context, parsed, meta, "item.call_number", "Call Number/Physical Location");
                                await HandleListValue(context, parsed, meta, "item.digital_id", "Digital Id");
                                await HandleSimpleValue(context, parsed, meta, "item.library_of_congress_control_number", "Library of Congress Control Number");
                                await HandleChildrenValue(context, parsed, meta, "item.language", "Language");
                                await HandleListValue(context, parsed, meta, "item.online_format", "Online Format");
                                await HandleListValue(context, parsed, meta, "item.number_oclc", "OCLC Number");

                                string_value = await HandleListValue(context, parsed, meta, "item.description", "Description");
                                if (!string.IsNullOrEmpty(string_value))
                                {
                                    book.Description = string_value;
                                    book.DescriptionInEnglish = string_value;
                                }

                                await HandleSimpleValue(context, parsed, meta, "cite_this.chicago", "Chicago citation style");
                                await HandleSimpleValue(context, parsed, meta, "cite_this.apa", "APA citation style");
                                await HandleSimpleValue(context, parsed, meta, "cite_this.mla", "MLA citation style");
                                await HandleChildrenValue(context, parsed, meta, "item.dates", "Dates");
                                await HandleChildrenValue(context, parsed, meta, "item.contributors", "Contributors");
                                await HandleChildrenValue(context, parsed, meta, "item.location", "Location");
                                await HandleListValue(context, parsed, meta, "item.rights", "Rights & Access");

                                RTagValue tag = await TagHandler.PrepareAttribute(context, "Type", "Book", 1);
                                meta.Add(tag);

                                tag = await TagHandler.PrepareAttribute(context, "Type", "Manuscript", 1);
                                meta.Add(tag);

                                tag = await TagHandler.PrepareAttribute(context, "Source", "Library of Congress, African and Middle East Division, Near East Section Persian Manuscript Collection", 1);
                                // Default supplement is the request url; the item id replaces it when
                                // present. A missing "item.id" token keeps the default instead of throwing.
                                tag.ValueSupplement = fullUrl;
                                string_value = parsed.SelectToken("item.id")?.Value<string>();
                                if (!string.IsNullOrWhiteSpace(string_value))
                                {
                                    tag.ValueSupplement = string_value;
                                }
                                meta.Add(tag);

                                book.Tags = meta.ToArray();

                                List<RArtifactItemRecord> pages = new List<RArtifactItemRecord>();

                                // Attaches the stored picture to the page, promotes it to book cover
                                // when the page is the representative one, and collects the page.
                                void AddPageWithImage(RArtifactItemRecord item, RPictureFile image, int itemOrder)
                                {
                                    item.Images = new RPictureFile[] { image };
                                    item.CoverImageIndex = 0;
                                    if (book.CoverItemIndex == (itemOrder - 1))
                                    {
                                        book.CoverImage = RPictureFile.Duplicate(image);
                                    }
                                    pages.Add(item);
                                }

                                // A 503 response is retried up to ImportRetryCount times, waiting
                                // ImportRetryInitialSleepMs * attempt milliseconds before each retry.
                                const int ImportRetryCount = 5;
                                const int ImportRetryInitialSleepMs = 500;

                                string storageFolder = Path.Combine(_pictureFileService.ImageStoragePath, friendlyUrl);

                                // Because of the loc problem with books of more than 1000 pages, pages are
                                // enumerated by index with hard-coded image urls rather than by walking
                                // segmentsArray (its "title", "id" and "image_url" fields are not used).
                                for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
                                {
                                    int order = pageIndex;

                                    using (RMuseumDbContext importJobUpdaterDb = new RMuseumDbContext(new DbContextOptions<RMuseumDbContext>()))
                                    {
                                        // Progress reflects the pages finished before this one.
                                        job.ProgressPercent = (order - 1) * 100 / (decimal)pageCount;
                                        importJobUpdaterDb.Update(job);
                                        await importJobUpdaterDb.SaveChangesAsync();
                                    }

                                    string paddedOrder = $"{order}".PadLeft(4, '0');
                                    string fileName = paddedOrder + ".jpg";

                                    RArtifactItemRecord page = new RArtifactItemRecord()
                                    {
                                        Name = $"تصویر {order}",
                                        NameInEnglish = $"Image {pageIndex} of {book.NameInEnglish}",
                                        Description = "",
                                        DescriptionInEnglish = "",
                                        Order = order,
                                        FriendlyUrl = $"p{paddedOrder}",
                                        LastModified = DateTime.Now
                                    };

                                    tag = await TagHandler.PrepareAttribute(context, "Source", "Library of Congress, African and Middle East Division, Near East Section Persian Manuscript Collection", 1);
                                    tag.ValueSupplement = $"http://www.loc.gov/resource/{resourcePrefix}.{resourceNumber}/?sp={pageIndex}";
                                    page.Tags = new RTagValue[] { tag };

                                    // The url below is fixed to the "amed" service; a previously tried
                                    // alternative pattern was
                                    // service:rbc:{resourcePrefix}:2015:{resourceNumber}:{imageUrlPart}.
                                    string imageUrlPart = $"{pageIndex}".PadLeft(4, '0');
                                    string imageUrl = $"https://tile.loc.gov/image-services/iiif/service:amed:{resourcePrefix}:{resourceNumber}:{imageUrlPart}/full/pct:100/0/default.jpg";

                                    string origPath = Path.Combine(Path.Combine(storageFolder, "orig"), fileName);
                                    string normPath = Path.Combine(Path.Combine(storageFolder, "norm"), fileName);
                                    string thumbPath = Path.Combine(Path.Combine(storageFolder, "thumb"), fileName);

                                    // When all three renditions are already on disk, try to reuse
                                    // them instead of downloading the image again.
                                    bool recovered = false;
                                    if (File.Exists(origPath) && File.Exists(normPath) && File.Exists(thumbPath))
                                    {
                                        RServiceResult<RPictureFile> recoveredPicture =
                                            await _pictureFileService.RecoverFromeFiles(page.Name, page.Description, 1, imageUrl, origPath, normPath, thumbPath, fileName, friendlyUrl);
                                        if (recoveredPicture.Result != null)
                                        {
                                            AddPageWithImage(page, recoveredPicture.Result, order);
                                            recovered = true;
                                        }
                                    }

                                    if (recovered)
                                    {
                                        continue;
                                    }

                                    // Remove whatever partial set of renditions is left before downloading.
                                    foreach (string stalePath in new[] { origPath, normPath, thumbPath })
                                    {
                                        if (File.Exists(stalePath))
                                        {
                                            File.Delete(stalePath);
                                        }
                                    }

                                    var imageResult = await client.GetAsync(imageUrl);
                                    try
                                    {
                                        int retryCount = 0;
                                        while (retryCount < ImportRetryCount && imageResult.StatusCode == HttpStatusCode.ServiceUnavailable)
                                        {
                                            imageResult.Dispose();
                                            // Task.Delay instead of Thread.Sleep so that no thread is blocked while waiting.
                                            await Task.Delay(ImportRetryInitialSleepMs * (retryCount + 1));
                                            imageResult = await client.GetAsync(imageUrl);
                                            retryCount++;
                                        }

                                        if (!imageResult.IsSuccessStatusCode)
                                        {
                                            await MarkJobFailedAsync($"Http result is not ok ({imageResult.StatusCode}) for page {order}, url {imageUrl}");
                                            return;
                                        }

                                        using (Stream imageStream = await imageResult.Content.ReadAsStreamAsync())
                                        {
                                            RServiceResult<RPictureFile> picture =
                                                await _pictureFileService.Add(page.Name, page.Description, 1, null, imageUrl, imageStream, fileName, friendlyUrl);
                                            if (picture.Result == null)
                                            {
                                                throw new Exception($"_pictureFileService.Add : {picture.ExceptionString}");
                                            }

                                            AddPageWithImage(page, picture.Result, order);
                                        }
                                    }
                                    finally
                                    {
                                        // Runs on every exit path (success, failure return, exception).
                                        imageResult.Dispose();
                                    }

                                    // NOTE(review): explicit collection kept from the original code; presumably
                                    // it limits memory growth while importing many large images - confirm
                                    // it is still needed.
                                    GC.Collect();
                                }

                                book.Items = pages.ToArray();
                                book.ItemCount = pages.Count;

                                // Fall back to the first page when the representative page gave no cover.
                                if (book.CoverImage == null && pages.Count > 0)
                                {
                                    book.CoverImage = RPictureFile.Duplicate(pages[0].Images.First());
                                }

                                await context.Artifacts.AddAsync(book);
                                await context.SaveChangesAsync();

                                job.ProgressPercent = 100;
                                job.Status = ImportJobStatus.Succeeded;
                                job.ArtifactId = book.Id;
                                job.EndTime = DateTime.Now;
                                context.Update(job);
                                await context.SaveChangesAsync();
                            }
                        }
                    }
                }
                catch (Exception exp)
                {
                    await MarkJobFailedAsync(exp.ToString());
                }
            }
        );

        return new RServiceResult<bool>(true);
    }
    catch (Exception exp)
    {
        return new RServiceResult<bool>(false, exp.ToString());
    }
}