コード例 #1
0
ファイル: Zip.cs プロジェクト: csu-xiao-an/InterLex-Project
        /// <summary>
        /// Builds an in-memory zip archive holding the raw content of every document in the group
        /// (see <see cref="DocumentModel.Raw"/>).
        /// </summary>
        /// <remarks>
        /// Each archive entry is named after the file-name portion of the document name
        /// (<see cref="Path.GetFileName(string)"/>), so any directory part of the name is dropped.
        /// </remarks>
        /// <param name="documentGroup">Document group with documents to be zipped</param>
        /// <returns>The bytes of the finished zip archive.</returns>
        public static byte[] DocumentGroup(DocumentGroupModel documentGroup)
        {
            using (var outputMemStream = new MemoryStream())
            using (var zipStream = new ZipOutputStream(outputMemStream))
            {
                zipStream.SetLevel(9); // 0-9, 9 being the highest compression

                foreach (var doc in documentGroup.Documents)
                {
                    zipStream.PutNextEntry(new ZipEntry(Path.GetFileName(doc.Name)));

                    // The content is already fully in memory, so it is written in one call instead of
                    // being copied through an intermediate stream and buffer.
                    zipStream.Write(doc.Raw, 0, doc.Raw.Length);
                }

                // Finish the archive before the backing buffer is copied out.
                zipStream.Finish();

                return outputMemStream.ToArray();
            }
        }
コード例 #2
0
        /// <summary>
        /// Ensures that no two documents in the group share a name, ignoring letter case.
        /// </summary>
        /// <remarks>
        /// The comparison is case-insensitive because <c>AddOrUpdateDocumentGroup</c> lower-cases every
        /// document name right after this check and documents are subsequently matched by name;
        /// names that differ only by case would therefore collide.
        /// </remarks>
        /// <param name="crawledDocumentGroup">Document group whose document names are checked.</param>
        /// <exception cref="ArgumentException">Thrown when at least two documents share a name.</exception>
        private static void ValidateDocumentNames(DocumentGroupModel crawledDocumentGroup)
        {
            var distinctNameCount = crawledDocumentGroup.Documents
                .Select(x => x.Name)
                .Distinct(StringComparer.OrdinalIgnoreCase)
                .Count();

            if (distinctNameCount != crawledDocumentGroup.Documents.Count)
            {
                throw new ArgumentException($"Document names must be unique! Document group name={crawledDocumentGroup.Name}");
            }
        }
コード例 #3
0
        /// <summary>
        /// Collects the documents of a crawled package into a <see cref="DocumentGroupModel"/>, fetching
        /// any document that is not downloaded yet, and marks the package as processed.
        /// </summary>
        /// <remarks>
        /// Processing stops at the first document that fails; the failure is logged and the package is
        /// still marked as processed. The call that would persist the document group is currently
        /// commented out, so nothing is written to the database by this method.
        /// </remarks>
        /// <param name="crawler">Crawler whose <c>Http</c> client fetches documents that are not downloaded yet.</param>
        /// <param name="package">Package to process; it is flagged as processed and <c>FreeResource</c> is called on it.</param>
        /// <param name="crawlerId">Crawler identifier stored on the resulting document group.</param>
        /// <returns>A task that completes when the package has been processed.</returns>
        private async Task AddOrUpdatePackageToDatabase(BaseHttpCrawler crawler, CrawlerDownloadPackageModel package, int crawlerId)
        {
            var documentGroupModel = new DocumentGroupModel
            {
                Name = package.Name.Value,
                TwoLetterLanguage = package.TwoLetterLanguage,
                CrawlerId         = crawlerId
            };

            var failed = false;

            foreach (var downloadModel in package.Documents)
            {
                try
                {
                    var httpGet = downloadModel.HttpGet;

                    // Compare against the literal instead of the failure flag: the flag only happened
                    // to be false here because the loop breaks as soon as it is set.
                    if (downloadModel.IsDownloaded == false)
                    {
                        httpGet = await crawler.Http.GetAsync(downloadModel.Url);
                    }

                    documentGroupModel.Documents.Add(new DocumentModel
                    {
                        Format = httpGet.MimeType,
                        Raw    = httpGet.Raw,
                        Name   = downloadModel.Name.Value,
                        Url    = downloadModel.Url,
                    });
                }
                catch (Exception e)
                {
                    this.logger.Error(message: $"package name: {package.Name}; url: {downloadModel.Url}", exception: e);
                    failed = true;
                    break;
                }
            }

            package.IsProcessed = true;
            package.FreeResource();
            this.logger.Info($"Processed package: {package.Url}");

            if (failed == false)
            {
                // NOTE(review): persistence is disabled below, so the collected group is discarded.
                //var isNewOrUpdated = this.documentGroupManager.AddOrUpdateDocumentGroup(documentGroupModel);
                //if (isNewOrUpdated)
                //{
                //    this.logger.Info(message: $"New or updated: {documentGroupModel.Name}; {documentGroupModel.Identifier}");
                //}
            }
        }
コード例 #4
0
        /// <summary>
        /// Adds the specified document group to the database, or updates it when a group with the same
        /// <see cref="DocumentGroupModel.Name"/> already exists.
        /// </summary>
        /// <remarks>
        /// The passed model is modified: every document receives its MD5 hash, a new GUID identifier and a
        /// lower-cased name, and the group name becomes its lower-cased form with a ".zip" suffix.
        /// </remarks>
        /// <param name="crawledDocumentGroup">The crawled document group to store.</param>
        /// <returns>
        /// The result of <c>ProcessDocumentGroup</c>: <c>true</c> when the group was added or updated,
        /// <c>false</c> when the stored group was already up to date.
        /// </returns>
        /// <exception cref="ArgumentException">Thrown by <c>ValidateDocumentNames</c> when document names are not unique.</exception>
        public bool AddOrUpdateDocumentGroup(DocumentGroupModel crawledDocumentGroup)
        {
            ValidateDocumentNames(crawledDocumentGroup);

            foreach (var document in crawledDocumentGroup.Documents)
            {
                document.Md5        = MD5Hash.GetMd5Hash(document.Raw);
                document.Identifier = Guid.NewGuid().ToString();
                document.Name       = document.Name.ToLower();
            }

            // The group is stored under the name of its zip file.
            crawledDocumentGroup.Name = crawledDocumentGroup.Name.ToLower() + ".zip";

            return this.ProcessDocumentGroup(crawledDocumentGroup);
        }
コード例 #5
0
        /// <summary>
        /// Bundles the given documents into a document group for this crawler with language "DE".
        /// </summary>
        /// <param name="docNumber">Value used as the name of the group.</param>
        /// <param name="nodeDocument">First document of the group.</param>
        /// <param name="documentModel">Second document of the group.</param>
        /// <param name="xmlDocument">Optional third document; left out when <c>null</c>.</param>
        /// <returns>The newly created document group.</returns>
        private DocumentGroupModel GetDocumentGroup(string docNumber, DocumentModel nodeDocument, DocumentModel documentModel, DocumentModel xmlDocument)
        {
            var groupDocuments = new List<DocumentModel>();
            groupDocuments.Add(nodeDocument);
            groupDocuments.Add(documentModel);

            if (xmlDocument != null)
            {
                groupDocuments.Add(xmlDocument);
            }

            return new DocumentGroupModel
            {
                CrawlerId         = this.CrawlerId,
                TwoLetterLanguage = "DE",
                Name              = docNumber,
                Documents         = groupDocuments
            };
        }
コード例 #6
0
        /// <summary>
        /// Compares a crawled document group with its stored counterpart, tags every document with the
        /// operation it requires and reports whether anything has to be written.
        /// </summary>
        /// <remarks>
        /// Documents are matched by name. A crawled document without a stored match is tagged <c>Add</c>.
        /// A matched document is tagged <c>Upd</c> (on both models) when its MD5 or format differs or the
        /// stored one is tagged <c>Del</c>; otherwise the crawled document is tagged <c>None</c>.
        /// Stored documents without a crawled match are tagged <c>Del</c>.
        /// </remarks>
        /// <param name="crawledDocumentGroup">Freshly crawled group; its documents' <c>Operation</c> is set.</param>
        /// <param name="documentGroupFromDatabase">Stored group; <c>Operation</c> is set on updated and deleted documents.</param>
        /// <returns><c>true</c> when at least one document was tagged Add, Upd or Del; otherwise <c>false</c>.</returns>
        private bool IsUpdated(DocumentGroupModel crawledDocumentGroup, DocumentGroupModel documentGroupFromDatabase)
        {
            var hasChanges = false;

            foreach (var incoming in crawledDocumentGroup.Documents)
            {
                var stored = documentGroupFromDatabase.Documents.FirstOrDefault(x => x.Name == incoming.Name);

                if (stored == null)
                {
                    incoming.Operation = DocumentModelOperation.Add;
                    hasChanges = true;
                    continue;
                }

                var differs = stored.Md5 != incoming.Md5
                              || stored.Operation == DocumentModelOperation.Del
                              || stored.Format != incoming.Format;

                if (!differs)
                {
                    incoming.Operation = DocumentModelOperation.None;
                    continue;
                }

                incoming.Operation = DocumentModelOperation.Upd;
                stored.Operation   = DocumentModelOperation.Upd;
                hasChanges = true;
            }

            foreach (var stored in documentGroupFromDatabase.Documents)
            {
                if (crawledDocumentGroup.Documents.Any(x => x.Name == stored.Name))
                {
                    continue;
                }

                // No longer present in the crawled group.
                stored.Operation = DocumentModelOperation.Del;
                hasChanges = true;
            }

            return hasChanges;
        }
コード例 #7
0
        /// <summary>
        /// Stores a crawled document group as a new record: builds the entity, zips the document
        /// content into it, attaches the document rows, saves, and then calls
        /// <c>PChangeOperationStatus</c> for the new group identifier.
        /// </summary>
        /// <param name="crawledDocumentGroup">The crawled document group to insert.</param>
        private void AddDocumentGroup(DocumentGroupModel crawledDocumentGroup)
        {
            var newDocumentGroup = new InterlexCrawlerEntities.DocumentGroup
            {
                CrawlerId           = crawledDocumentGroup.CrawlerId,
                Identifier          = Guid.NewGuid().ToString(),
                DocumentGroupName   = crawledDocumentGroup.Name,
                DocumentGroupFormat = "application/zip",
                Lang                = crawledDocumentGroup.TwoLetterLanguage,
                Operation           = (int)DocumentGroupModelOperation.Add,

                // Invariant culture keeps the stored timestamp format independent of the current
                // thread culture (the ':' separator and the calendar are culture-sensitive otherwise).
                DocumentGroupDate   = DateTime.UtcNow.ToString("yyyy-MM-ddTHH:mm:ss", System.Globalization.CultureInfo.InvariantCulture),

                // Byte Array Data
                DataContent         = Zip.DocumentGroup(crawledDocumentGroup)
            };

            this.AddDocuments(newDocumentGroup, crawledDocumentGroup);

            using (var context = new InterlexCrawlerEntities())
            {
                context.DocumentGroups.Add(newDocumentGroup);
                context.SaveChanges();
                context.PChangeOperationStatus(newDocumentGroup.Identifier, (int)DocumentGroupModelOperation.Add, "DatabaseDocumentManager", null);
            }
        }
コード例 #8
0
        /// <summary>
        /// Inserts the crawled document group when no stored group exists for its crawler id and name,
        /// or updates the stored group when <c>IsUpdated</c> reports differences.
        /// </summary>
        /// <param name="crawledDocumentGroup">The crawled document group to process.</param>
        /// <returns><c>true</c> when the group was added or updated; <c>false</c> when nothing had to change.</returns>
        private bool ProcessDocumentGroup(DocumentGroupModel crawledDocumentGroup)
        {
            var storedGroup = this.GetDocumentGroupInfo(crawledDocumentGroup.CrawlerId, crawledDocumentGroup.Name);

            if (storedGroup == null)
            {
                this.AddDocumentGroup(crawledDocumentGroup);
                return true;
            }

            if (!this.IsUpdated(crawledDocumentGroup, storedGroup))
            {
                return false;
            }

            this.UpdateDocumentGroup(crawledDocumentGroup, storedGroup);
            return true;
        }
コード例 #9
0
        /// <summary>
        /// Turns every (id, content) pair returned by <c>GetNewOrUpdatedInterlexEditorToolLazy</c> into a
        /// single-document group and hands it to the document group manager.
        /// </summary>
        /// <remarks>
        /// The work is done synchronously; the returned task is already completed. A failure for one
        /// item is logged under its id and does not stop the remaining items.
        /// </remarks>
        /// <returns>A completed task.</returns>
        public override Task StartAsync()
        {
            using (var context = new InterlexCrawlerEntities())
            {
                foreach (var (id, content) in context.GetNewOrUpdatedInterlexEditorToolLazy())
                {
                    try
                    {
                        // The content is stored as one JSON document named "content".
                        var contentDocument = new DocumentModel
                        {
                            Raw    = encoding.GetBytes(content),
                            Name   = "content",
                            Format = "application/json",
                            Url    = "local"
                        };

                        var group = new DocumentGroupModel
                        {
                            CrawlerId         = this.CrawlerId,
                            Name              = id,
                            TwoLetterLanguage = "EU",
                            Documents         = { contentDocument }
                        };

                        this.DocumentGroupManager.AddOrUpdateDocumentGroup(group);
                    }
                    catch (Exception e)
                    {
                        this.Logger.Error($"{id}", e);
                    }
                }
            }

            return Task.CompletedTask;
        }
コード例 #10
0
        /// <summary>
        /// Creates one document entity per crawled document and attaches it to the new document group.
        /// </summary>
        /// <remarks>
        /// Document names are stored lower-cased and <c>DocumentOrder</c> is assigned sequentially,
        /// starting at 1, in the order of the crawled documents.
        /// </remarks>
        /// <param name="newDocumentGroup">Entity that receives the document entities.</param>
        /// <param name="crawleredDocumentGroup">Crawled group whose documents are converted.</param>
        private void AddDocuments(InterlexCrawlerEntities.DocumentGroup newDocumentGroup, DocumentGroupModel crawleredDocumentGroup)
        {
            var nextOrder = 1;

            foreach (var source in crawleredDocumentGroup.Documents)
            {
                newDocumentGroup.Documents.Add(new InterlexCrawlerEntities.Document
                {
                    DocumentName   = source.Name.ToLower(),
                    Identifier     = source.Identifier,
                    DocumentFormat = source.Format,
                    Operation      = (int)source.Operation,
                    DocumentOrder  = nextOrder,
                    Url            = source.Url,
                    Md5            = source.Md5
                });

                nextOrder += 1;
            }
        }
コード例 #11
0
        /// <summary>
        /// Applies a crawled document group to its stored record: refreshes the zipped content, date and
        /// language, inserts documents tagged <c>Add</c>, refreshes documents tagged <c>Upd</c>, tags
        /// stored documents marked <c>Del</c>, saves, and then calls <c>PChangeOperationStatus</c>.
        /// </summary>
        /// <param name="crawledDocumentGroup">Crawled group whose documents already carry their operation.</param>
        /// <param name="documentGroupFromDatabase">Stored group model; supplies the identifier of the record and the documents tagged <c>Del</c>.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the stored group cannot be found exactly once, or when a document tagged
        /// <c>Upd</c> has no matching stored document.
        /// </exception>
        private void UpdateDocumentGroup(DocumentGroupModel crawledDocumentGroup, DocumentGroupModel documentGroupFromDatabase)
        {
            using (var context = new InterlexCrawlerEntities())
            {
                var documentGroupDb = (from dg in context.DocumentGroups
                                       where dg.Identifier == documentGroupFromDatabase.Identifier
                                       select dg).Single();

                context.Entry(documentGroupDb).Collection(x => x.Documents).Load();

                documentGroupDb.Operation   = (int)DocumentGroupModelOperation.Upd;
                documentGroupDb.DataContent = Zip.DocumentGroup(crawledDocumentGroup);

                // Invariant culture keeps the stored timestamp format independent of the current
                // thread culture (the ':' separator and the calendar are culture-sensitive otherwise).
                documentGroupDb.DocumentGroupDate = DateTime.UtcNow.ToString("yyyy-MM-ddTHH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
                documentGroupDb.Lang = crawledDocumentGroup.TwoLetterLanguage;

                // Orders are 1-based, matching AddDocuments; a 0-based counter here would collide with
                // the orders of unchanged documents that were numbered when the group was first added.
                var documentOrder = 1;
                foreach (var crawledDocument in crawledDocumentGroup.Documents)
                {
                    if (crawledDocument.Operation == DocumentModelOperation.Add)
                    {
                        documentGroupDb.Documents.Add(new InterlexCrawlerEntities.Document
                        {
                            DocumentName   = crawledDocument.Name.ToLower(),
                            Identifier     = crawledDocument.Identifier,
                            DocumentFormat = crawledDocument.Format,
                            Operation      = (int)crawledDocument.Operation,
                            DocumentOrder  = documentOrder,
                            Url            = crawledDocument.Url,
                            Md5            = crawledDocument.Md5
                        });
                    }
                    else if (crawledDocument.Operation == DocumentModelOperation.Upd)
                    {
                        var dbDocument = documentGroupDb.Documents.FirstOrDefault(x => x.DocumentName == crawledDocument.Name);
                        if (dbDocument == null)
                        {
                            // Fail with a descriptive error instead of a NullReferenceException.
                            throw new InvalidOperationException($"Document '{crawledDocument.Name}' is marked for update but was not found in document group '{documentGroupDb.Identifier}'.");
                        }

                        dbDocument.Operation      = (int)crawledDocument.Operation;
                        dbDocument.DocumentOrder  = documentOrder;
                        dbDocument.DocumentFormat = crawledDocument.Format;
                        dbDocument.Url            = crawledDocument.Url;
                        dbDocument.Md5            = crawledDocument.Md5;
                    }

                    // Documents with no operation keep their stored row but still occupy a position.
                    documentOrder++;
                }

                foreach (var documenInfo in documentGroupFromDatabase.Documents)
                {
                    if (documenInfo.Operation != DocumentModelOperation.Del)
                    {
                        continue;
                    }

                    var dbDocument = documentGroupDb.Documents.FirstOrDefault(x => x.DocumentName == documenInfo.Name);
                    if (dbDocument != null)
                    {
                        // The row is kept and only tagged as deleted.
                        dbDocument.Operation = (int)documenInfo.Operation;
                    }
                }

                context.SaveChanges();

                context.PChangeOperationStatus(documentGroupDb.Identifier, (int)DocumentGroupModelOperation.Upd, "DatabaseDocumentManager", null);
            }
        }