/// <summary>
/// Builds an in-memory zip archive with one entry per document of the group; each entry
/// holds the document's raw bytes (see <see cref="DocumentModel.Raw"/>).
/// </summary>
/// <param name="documentGroup">Document group whose documents are written into the archive.</param>
/// <returns>The bytes of the finished zip archive.</returns>
public static byte[] DocumentGroup(DocumentGroupModel documentGroup)
{
    using (var archiveBytes = new MemoryStream())
    using (var zip = new ZipOutputStream(archiveBytes))
    {
        // Compression level goes from 0 to 9; 9 is the strongest compression.
        zip.SetLevel(9);

        var chunk = new byte[4096];
        foreach (var document in documentGroup.Documents)
        {
            // Only the file-name part of the document name becomes the entry name.
            zip.PutNextEntry(new ZipEntry(Path.GetFileName(document.Name)));

            using (var content = new MemoryStream(document.Raw, 0, document.Raw.Length))
            {
                // Copy the document in chunks; the loop ends after the read that returns no bytes.
                while (true)
                {
                    var read = content.Read(chunk, 0, chunk.Length);
                    zip.Write(chunk, 0, read);
                    if (read <= 0)
                    {
                        break;
                    }
                }
            }
        }

        // Finish the archive before the backing buffer is copied out.
        zip.Finish();
        return archiveBytes.ToArray();
    }
}
/// <summary>
/// Verifies that no two documents of the group share the same name.
/// </summary>
/// <param name="crawledDocumentGroup">Document group whose document names are checked.</param>
/// <exception cref="ArgumentException">Thrown when a document name occurs more than once.</exception>
private static void ValidateDocumentNames(DocumentGroupModel crawledDocumentGroup)
{
    // A name that groups more than one document is a duplicate.
    var hasDuplicates = crawledDocumentGroup.Documents
        .GroupBy(document => document.Name)
        .Any(sameName => sameName.Skip(1).Any());

    if (!hasDuplicates)
    {
        return;
    }

    throw new ArgumentException($"Document names must be unique! Document group name={crawledDocumentGroup.Name}");
}
/// <summary>
/// Collects the documents of a crawled package into a <see cref="DocumentGroupModel"/>, fetching every
/// document that has not been downloaded yet, then marks the package as processed and frees its resources.
/// </summary>
/// <remarks>
/// Persisting the collected group is currently disabled: the call to the document group manager is
/// commented out at the end of the method.
/// </remarks>
/// <param name="crawler">Crawler whose HTTP client is used for documents that still need to be fetched.</param>
/// <param name="package">Package describing the documents to collect.</param>
/// <param name="crawlerId">Crawler id stored on the resulting document group.</param>
private async Task AddOrUpdatePackageToDatabase(BaseHttpCrawler crawler, CrawlerDownloadPackageModel package, int crawlerId)
{
    var documentGroupModel = new DocumentGroupModel
    {
        Name = package.Name.Value,
        TwoLetterLanguage = package.TwoLetterLanguage,
        CrawlerId = crawlerId
    };

    var failed = false;
    foreach (var downloadModel in package.Documents)
    {
        try
        {
            var httpGet = downloadModel.HttpGet;

            // Fetch only what has not been downloaded yet. The previous code compared against the
            // 'failed' flag, which is always false here and only worked by coincidence.
            if (downloadModel.IsDownloaded == false)
            {
                httpGet = await crawler.Http.GetAsync(downloadModel.Url);
            }

            documentGroupModel.Documents.Add(new DocumentModel
            {
                Format = httpGet.MimeType,
                Raw = httpGet.Raw,
                Name = downloadModel.Name.Value,
                Url = downloadModel.Url,
            });
        }
        catch (Exception e)
        {
            // One failing document aborts the whole package.
            this.logger.Error(message: $"package name: {package.Name}; url: {downloadModel.Url}", exception: e);
            failed = true;
            break;
        }
    }

    // NOTE(review): the package is flagged as processed even when a document failed - confirm this is intended.
    package.IsProcessed = true;
    package.FreeResource();
    this.logger.Info($"Processed package: {package.Url}");

    if (failed == false)
    {
        //var isNewOrUpdated = this.documentGroupManager.AddOrUpdateDocumentGroup(documentGroupModel);
        //if (isNewOrUpdated)
        //{
        //    this.logger.Info(message: $"New or updated: {documentGroupModel.Name}; {documentGroupModel.Identifier}");
        //}
    }
}
/// <summary>
/// Adds or updates the specified document group in the database, using the
/// <see cref="DocumentGroupModel.Name"/> property to match an already stored document group.
/// </summary>
/// <remarks>
/// Document names are lower-cased and the group name is turned into a lower-cased "&lt;name&gt;.zip"
/// before the group is processed. Each document also receives its MD5 hash and a new identifier.
/// </remarks>
/// <param name="crawledDocumentGroup">Crawled document group; it is modified in place.</param>
/// <returns>The result of <c>ProcessDocumentGroup</c> for the normalised group.</returns>
/// <exception cref="ArgumentException">
/// Thrown by <c>ValidateDocumentNames</c> when two documents share a name after lower-casing.
/// </exception>
public bool AddOrUpdateDocumentGroup(DocumentGroupModel crawledDocumentGroup)
{
    // Lower-case the names BEFORE checking uniqueness: names differing only by case would otherwise
    // pass validation and then collide once they are lower-cased.
    foreach (var crawledDocument in crawledDocumentGroup.Documents)
    {
        crawledDocument.Name = crawledDocument.Name.ToLower();
    }

    ValidateDocumentNames(crawledDocumentGroup);

    foreach (var crawledDocument in crawledDocumentGroup.Documents)
    {
        crawledDocument.Md5 = MD5Hash.GetMd5Hash(crawledDocument.Raw);
        crawledDocument.Identifier = Guid.NewGuid().ToString();
    }

    // The group is stored under the lower-cased name with a ".zip" suffix.
    var folderName = crawledDocumentGroup.Name.ToLower();
    var zipFileName = folderName + ".zip";
    crawledDocumentGroup.Name = zipFileName;

    return this.ProcessDocumentGroup(crawledDocumentGroup);
}
/// <summary>
/// Assembles a document group named after the document number from the node document, the main
/// document and, when present, the XML document.
/// </summary>
/// <param name="docNumber">Value used as the group name.</param>
/// <param name="nodeDocument">First document of the group.</param>
/// <param name="documentModel">Second document of the group.</param>
/// <param name="xmlDocument">Optional third document; left out when <c>null</c>.</param>
/// <returns>The group, carrying this crawler's id and the language code "DE".</returns>
private DocumentGroupModel GetDocumentGroup(string docNumber, DocumentModel nodeDocument, DocumentModel documentModel, DocumentModel xmlDocument)
{
    var members = new List<DocumentModel>();
    members.Add(nodeDocument);
    members.Add(documentModel);

    // The XML document is optional.
    if (xmlDocument != null)
    {
        members.Add(xmlDocument);
    }

    return new DocumentGroupModel
    {
        CrawlerId = this.CrawlerId,
        TwoLetterLanguage = "DE",
        Name = docNumber,
        Documents = members
    };
}
/// <summary>
/// Compares the crawled group with its stored counterpart by document name and sets the
/// <c>Operation</c> of the documents on both sides accordingly.
/// </summary>
/// <param name="crawledDocumentGroup">Freshly crawled group; its documents are set to Add, Upd or None.</param>
/// <param name="documentGroupFromDatabase">Stored group; its documents may be set to Upd or Del.</param>
/// <returns><c>true</c> when at least one document was added, changed or removed; otherwise <c>false</c>.</returns>
private bool IsUpdated(DocumentGroupModel crawledDocumentGroup, DocumentGroupModel documentGroupFromDatabase)
{
    var hasChanges = false;

    foreach (var crawled in crawledDocumentGroup.Documents)
    {
        var storedDocument = documentGroupFromDatabase.Documents.FirstOrDefault(x => x.Name == crawled.Name);

        // No stored document with this name: it is new.
        if (storedDocument == null)
        {
            crawled.Operation = DocumentModelOperation.Add;
            hasChanges = true;
            continue;
        }

        // Unchanged means same hash, same format and not previously flagged as deleted.
        var isUnchanged = storedDocument.Md5 == crawled.Md5
            && storedDocument.Operation != DocumentModelOperation.Del
            && storedDocument.Format == crawled.Format;

        if (isUnchanged)
        {
            crawled.Operation = DocumentModelOperation.None;
            continue;
        }

        crawled.Operation = DocumentModelOperation.Upd;
        storedDocument.Operation = DocumentModelOperation.Upd;
        hasChanges = true;
    }

    // Stored documents that were not crawled any more are flagged for deletion.
    foreach (var databaseDocument in documentGroupFromDatabase.Documents)
    {
        var stillCrawled = crawledDocumentGroup.Documents.Any(x => x.Name == databaseDocument.Name);
        if (stillCrawled)
        {
            continue;
        }

        databaseDocument.Operation = DocumentModelOperation.Del;
        hasChanges = true;
    }

    return hasChanges;
}
/// <summary>
/// Creates a new document group row from the crawled group, including the zipped content and one row
/// per document, saves it and then calls <c>PChangeOperationStatus</c> for the new identifier.
/// </summary>
/// <param name="crawledDocumentGroup">Crawled group to store.</param>
private void AddDocumentGroup(DocumentGroupModel crawledDocumentGroup)
{
    var newDocumentGroup = new InterlexCrawlerEntities.DocumentGroup
    {
        CrawlerId = crawledDocumentGroup.CrawlerId,
        Identifier = Guid.NewGuid().ToString(),
        DocumentGroupName = crawledDocumentGroup.Name,
        DocumentGroupFormat = "application/zip",
        Lang = crawledDocumentGroup.TwoLetterLanguage,
        Operation = (int)DocumentGroupModelOperation.Add,

        // Format with the invariant culture: with the current culture the calendar and the ':' time
        // separator depend on the machine's regional settings, so the stored text could vary.
        DocumentGroupDate = DateTime.UtcNow.ToString("yyyy-MM-ddTHH:mm:ss", System.Globalization.CultureInfo.InvariantCulture),

        // Zip archive of all documents as a byte array.
        DataContent = Zip.DocumentGroup(crawledDocumentGroup)
    };

    this.AddDocuments(newDocumentGroup, crawledDocumentGroup);

    using (var context = new InterlexCrawlerEntities())
    {
        context.DocumentGroups.Add(newDocumentGroup);
        context.SaveChanges();

        // Called only after the group has been saved.
        context.PChangeOperationStatus(newDocumentGroup.Identifier, (int)DocumentGroupModelOperation.Add, "DatabaseDocumentManager", null);
    }
}
/// <summary>
/// Stores the crawled group: adds it when no stored group is found for its crawler id and name,
/// updates it when <c>IsUpdated</c> reports a difference, and otherwise does nothing.
/// </summary>
/// <param name="crawledDocumentGroup">Crawled group to add or update.</param>
/// <returns><c>true</c> when the group was added or updated; <c>false</c> when nothing changed.</returns>
private bool ProcessDocumentGroup(DocumentGroupModel crawledDocumentGroup)
{
    var storedGroup = this.GetDocumentGroupInfo(crawledDocumentGroup.CrawlerId, crawledDocumentGroup.Name);

    // Not stored yet: add it.
    if (storedGroup == null)
    {
        this.AddDocumentGroup(crawledDocumentGroup);
        return true;
    }

    // Stored and identical: nothing to do.
    if (!this.IsUpdated(crawledDocumentGroup, storedGroup))
    {
        return false;
    }

    this.UpdateDocumentGroup(crawledDocumentGroup, storedGroup);
    return true;
}
/// <summary>
/// Wraps every (id, content) pair returned by <c>GetNewOrUpdatedInterlexEditorToolLazy</c> in a
/// single-document group and hands it to the document group manager.
/// </summary>
/// <remarks>
/// The work runs synchronously. A failure for one pair is logged under its id and the loop continues.
/// </remarks>
/// <returns>An already completed task.</returns>
public override Task StartAsync()
{
    using (var context = new InterlexCrawlerEntities())
    {
        foreach (var (id, content) in context.GetNewOrUpdatedInterlexEditorToolLazy())
        {
            try
            {
                var group = new DocumentGroupModel
                {
                    CrawlerId = this.CrawlerId,
                    Name = id,
                    TwoLetterLanguage = "EU"
                };

                // The content becomes the only document of the group.
                group.Documents.Add(new DocumentModel
                {
                    Raw = encoding.GetBytes(content),
                    Name = "content",
                    Format = "application/json",
                    Url = "local"
                });

                this.DocumentGroupManager.AddOrUpdateDocumentGroup(group);
            }
            catch (Exception e)
            {
                this.Logger.Error($"{id}", e);
            }
        }
    }

    return Task.CompletedTask;
}
/// <summary>
/// Appends one document row per crawled document to the new document group row.
/// </summary>
/// <param name="newDocumentGroup">Group row that receives the document rows.</param>
/// <param name="crawleredDocumentGroup">Crawled group supplying the documents.</param>
private void AddDocuments(InterlexCrawlerEntities.DocumentGroup newDocumentGroup, DocumentGroupModel crawleredDocumentGroup)
{
    var position = 0;
    foreach (var source in crawleredDocumentGroup.Documents)
    {
        // Document order is 1-based and follows the order of the crawled documents.
        position++;

        newDocumentGroup.Documents.Add(new InterlexCrawlerEntities.Document
        {
            DocumentName = source.Name.ToLower(),
            Identifier = source.Identifier,
            DocumentFormat = source.Format,
            Operation = (int)source.Operation,
            DocumentOrder = position,
            Url = source.Url,
            Md5 = source.Md5
        });
    }
}
/// <summary>
/// Applies the crawled state of an existing document group to its database row and to the rows of its
/// documents, saves the changes and then calls <c>PChangeOperationStatus</c> for the group.
/// </summary>
/// <param name="crawledDocumentGroup">
/// Crawled group; the <c>Operation</c> of each document decides whether a row is added (Add), changed
/// (Upd) or left untouched.
/// </param>
/// <param name="documentGroupFromDatabase">
/// Stored counterpart; supplies the identifier used to load the row and the documents flagged as Del.
/// </param>
/// <exception cref="InvalidOperationException">
/// Thrown when the group row cannot be found exactly once, or when a document flagged as Upd has no
/// matching row.
/// </exception>
private void UpdateDocumentGroup(DocumentGroupModel crawledDocumentGroup, DocumentGroupModel documentGroupFromDatabase)
{
    using (var context = new InterlexCrawlerEntities())
    {
        var groupIdentifier = documentGroupFromDatabase.Identifier;
        var documentGroupDb = context.DocumentGroups.Single(dg => dg.Identifier == groupIdentifier);
        context.Entry(documentGroupDb).Collection(x => x.Documents).Load();

        documentGroupDb.Operation = (int)DocumentGroupModelOperation.Upd;
        documentGroupDb.DataContent = Zip.DocumentGroup(crawledDocumentGroup);

        // Format with the invariant culture: with the current culture the calendar and the ':' time
        // separator depend on the machine's regional settings, so the stored text could vary.
        documentGroupDb.DocumentGroupDate = DateTime.UtcNow.ToString("yyyy-MM-ddTHH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
        documentGroupDb.Lang = crawledDocumentGroup.TwoLetterLanguage;

        // Order is 1-based, matching AddDocuments. Starting at 0 here could give an updated row the
        // same order as an untouched one.
        var documentOrder = 1;
        foreach (var crawledDocument in crawledDocumentGroup.Documents)
        {
            if (crawledDocument.Operation == DocumentModelOperation.Add)
            {
                documentGroupDb.Documents.Add(new InterlexCrawlerEntities.Document
                {
                    DocumentName = crawledDocument.Name.ToLower(),
                    Identifier = crawledDocument.Identifier,
                    DocumentFormat = crawledDocument.Format,
                    Operation = (int)crawledDocument.Operation,
                    DocumentOrder = documentOrder,
                    Url = crawledDocument.Url,
                    Md5 = crawledDocument.Md5
                });
            }
            else if (crawledDocument.Operation == DocumentModelOperation.Upd)
            {
                var dbDocument = documentGroupDb.Documents.FirstOrDefault(x => x.DocumentName == crawledDocument.Name);
                if (dbDocument == null)
                {
                    // Fail with a clear message instead of a NullReferenceException.
                    throw new InvalidOperationException($"Document '{crawledDocument.Name}' is marked as updated but was not found in document group '{documentGroupDb.Identifier}'.");
                }

                dbDocument.Operation = (int)crawledDocument.Operation;
                dbDocument.DocumentOrder = documentOrder;
                dbDocument.DocumentFormat = crawledDocument.Format;
                dbDocument.Url = crawledDocument.Url;
                dbDocument.Md5 = crawledDocument.Md5;
            }

            // The position advances for every crawled document, including untouched ones.
            documentOrder++;
        }

        // Stored documents flagged as deleted only have their operation changed; the rows stay.
        foreach (var documenInfo in documentGroupFromDatabase.Documents)
        {
            if (documenInfo.Operation != DocumentModelOperation.Del)
            {
                continue;
            }

            var dbDocument = documentGroupDb.Documents.FirstOrDefault(x => x.DocumentName == documenInfo.Name);
            if (dbDocument != null)
            {
                dbDocument.Operation = (int)documenInfo.Operation;
            }
        }

        context.SaveChanges();

        // Called only after the changes have been saved.
        context.PChangeOperationStatus(documentGroupDb.Identifier, (int)DocumentGroupModelOperation.Upd, "DatabaseDocumentManager", null);
    }
}