/// <summary>
/// Runs the crawler and checks every downloaded document for differences.
/// Changes that are still pending when the crawl finishes are saved to the database,
/// and the changed document groups are then sent to the service.
/// </summary>
/// <param name="crawlerClass">The <see cref="BaseCrawler"/> whose downloads are processed.</param>
/// <exception cref="ArgumentNullException"><paramref name="crawlerClass"/> is null.</exception>
public void Start(BaseCrawler crawlerClass)
{
    if (crawlerClass == null)
    {
        throw new ArgumentNullException("crawlerClass");
    }

    this.counter = 0;
    this.context = new SQLiteCrawlerDataEntities(Arguments.SQLiteConnectionString);
    this.context.Configuration.ValidateOnSaveEnabled = false;

    crawlerClass.OnDocDownloaded += this.CrawlerClass_OnDocDownloaded;
    try
    {
        crawlerClass.Start();
    }
    finally
    {
        // Detach the handler so that calling Start again with the same crawler
        // does not register it twice and process every download twice.
        crawlerClass.OnDocDownloaded -= this.CrawlerClass_OnDocDownloaded;
    }

    // Flush whatever the download handler accumulated since its last batch save.
    if (this.counter > 0)
    {
        try
        {
            this.context.SaveChanges();
        }
        catch (Exception ex)
        {
            // Best effort: a failed save is logged and the upload still runs.
            CrawlerLog.LogException(ex);
        }
    }

    this.UploadToService();
}
/// <summary>
/// Decides whether a downloaded document group has to be added or updated.
/// An existing group (looked up by its zip file name) is updated, otherwise a new one is added.
/// When the group needs processing, its files are zipped and the change is counted towards
/// the next batch save.
/// </summary>
/// <param name="sender">The object that raised the event (not used).</param>
/// <param name="e">Event data; must be a <see cref="DocDownloadedEventArguments"/> instance.</param>
/// <exception cref="ArgumentException"><paramref name="e"/> is not a <see cref="DocDownloadedEventArguments"/>.</exception>
private void CrawlerClass_OnDocDownloaded(object sender, EventArgs e)
{
    // Cast once and fail with a clear message instead of a NullReferenceException.
    var downloaded = e as DocDownloadedEventArguments;
    if (downloaded == null)
    {
        throw new ArgumentException("Expected event data of type DocDownloadedEventArguments.", "e");
    }

    var filesMetaInfo = downloaded.Document.ToList();

    // NOTE(review): ToLower() is culture-sensitive; the lowered name is used as the lookup key below.
    string folderName = downloaded.Folder.ToLower();
    string zipFileName = folderName + ".zip";
    string docDir = Path.Combine(Arguments.DataFolder, folderName);
    bool isProcess = false;

    DocumentGroup documentGroupDb =
        (from d in this.context.DocumentGroups.Include("Documents")
         where d.DocumentGroupName == zipFileName
         select d).FirstOrDefault();

    if (documentGroupDb != null)
    {
        // Update the existing DocumentGroup.
        isProcess = this.UpdateDocumentGroup(filesMetaInfo, documentGroupDb);
    }
    else
    {
        // Add a new DocumentGroup.
        isProcess = this.AddDocumentGroup(filesMetaInfo, zipFileName, this.context);
    }

    // Deletion of document groups is not handled here.
    if (isProcess)
    {
        // Create the zip for this group.
        this.MemoryZip(docDir, filesMetaInfo);
        this.counter++;

        // Persist the accumulated adds/updates once the batch threshold is exceeded.
        if (this.counter > EtityWriteSize)
        {
            try
            {
                this.context.SaveChanges();
            }
            catch (Exception ex)
            {
                // Best effort: the failure is logged and the batch is dropped with the context below.
                CrawlerLog.LogException(ex);
            }

            // Start the next batch with a fresh context.
            this.context.Dispose();
            this.context = new SQLiteCrawlerDataEntities(Arguments.SQLiteConnectionString);
            this.context.Configuration.ValidateOnSaveEnabled = false;
            this.counter = 0;
        }
    }
}
/// <summary>
/// Creates a new DocumentGroup with one Document per supplied file and adds it to the context.
/// Each file's meta info is stamped with its MD5 hash, the Add operation and a new identifier.
/// </summary>
/// <param name="filesMetaInfo">Meta info of the files that belong to the group; updated in place.</param>
/// <param name="zipFileName">Zip file name of the group; stored lower-cased as the group name.</param>
/// <param name="context">Context the new group is added to. Changes are not saved here.</param>
/// <returns>True when at least one file was supplied; otherwise false.</returns>
private bool AddDocumentGroup(
    List<XmlDocumentMetaInfo> filesMetaInfo,
    string zipFileName,
    SQLiteCrawlerDataEntities context)
{
    bool isProcess = false;

    // Add DocumentGroup
    DocumentGroup newDocumentGroup = new DocumentGroup();

    // Format with the invariant culture so the stored timestamp always uses the
    // Gregorian calendar and ':' separators, whatever the current thread culture is.
    newDocumentGroup.DocumentGroupDate = DateTime.UtcNow.ToString(
        "yyyy-MM-ddTHH:mm:ss",
        System.Globalization.CultureInfo.InvariantCulture);
    newDocumentGroup.DocumentGroupName = zipFileName.ToLower();
    newDocumentGroup.Identifier = Guid.NewGuid().ToString();
    newDocumentGroup.Operation = (int)Operation.Add;

    foreach (var fileMetaInfo in filesMetaInfo)
    {
        fileMetaInfo.Md5 = MD5HashHelper.GetMd5Hash(fileMetaInfo.DataContent);
        fileMetaInfo.Operation = Operation.Add;
        fileMetaInfo.Identifier = Guid.NewGuid().ToString();
        isProcess = true;

        Document document = this.ReturnNewDocument(fileMetaInfo);
        newDocumentGroup.Documents.Add(document);
    }

    // NOTE(review): the group is added even when filesMetaInfo is empty (return value false);
    // confirm that persisting an empty group is intended.
    context.DocumentGroups.Add(newDocumentGroup);

    return isProcess;
}
/// <summary>
/// Sends every document group whose operation is not <c>Operation.None</c> to the service,
/// together with its documents. Uploads run in parallel (at most 5 at a time); the ids of
/// successfully uploaded groups are passed to <c>UpdateSqLiteDB</c> in batches.
/// The total elapsed time is logged when the upload completes without an exception.
/// </summary>
private void UploadToService()
{
    Stopwatch sw = Stopwatch.StartNew();
    try
    {
        // Fetch the names of all document groups that have to be uploaded.
        List<string> docGroupNames;
        using (var context = new SQLiteCrawlerDataEntities(Arguments.SQLiteConnectionString))
        {
            docGroupNames = (from dg in context.DocumentGroups
                             where dg.Operation != (int)Operation.None
                             select dg.DocumentGroupName).ToList();
        }

        List<long> docGroupIds = new List<long>(EntityUploadSize);

        Parallel.ForEach(
            docGroupNames,
            new ParallelOptions { MaxDegreeOfParallelism = 5 },
            docGroupName =>
            {
                // Each worker uses its own context.
                using (var context = new SQLiteCrawlerDataEntities(Arguments.SQLiteConnectionString))
                {
                    DocumentGroup documentGroup =
                        (from d in context.DocumentGroups.Include("Documents")
                         where d.DocumentGroupName == docGroupName
                         select d).FirstOrDefault();

                    if (documentGroup == null)
                    {
                        // The group was not found when re-queried by name;
                        // skip it instead of failing the whole upload on a null reference.
                        CrawlerLog.LogInfo("document group not found for upload:" + docGroupName);
                        return;
                    }

                    using (ServiceContractor sc = new ServiceContractor())
                    {
                        bool isNormalUpload = sc.UploadToService(documentGroup);
                        if (isNormalUpload)
                        {
                            // docGroupIds is shared between workers; guard both the add and the batch flush.
                            lock (objectLock)
                            {
                                docGroupIds.Add(documentGroup.DocumentGroupId);
                                if (docGroupIds.Count > EntityUploadSize)
                                {
                                    this.UpdateSqLiteDB(docGroupIds);
                                    docGroupIds = new List<long>(EntityUploadSize);
                                }
                            }
                        }
                    }
                }
            });

        // Flush the ids left over from the last, incomplete batch.
        this.UpdateSqLiteDB(docGroupIds);
    }
    finally
    {
        sw.Stop();
    }

    CrawlerLog.LogInfo("upload time in ms:" + sw.ElapsedMilliseconds);
}