/// <summary>
/// Appends file-system metadata (timestamps, size, NTFS access rules) to the
/// item's Metadata collection. Best-effort: any failure is logged and the item
/// is returned with whatever metadata was collected before the error.
/// </summary>
/// <param name="item">The iterator item whose Metadata list is appended to (mutated in place).</param>
/// <param name="info">The file whose attributes and ACL are read.</param>
/// <returns>A completed task holding the same (mutated) <paramref name="item"/>.</returns>
private Task<IteratorItem> CollectFileMetadata(IteratorItem item, FileInfo info)
{
    // FIX: the original was declared `async` with no `await` (compiler warning
    // CS1998) and executed entirely synchronously anyway. Returning
    // Task.FromResult keeps the Task<IteratorItem> signature for existing
    // callers without the misleading async state machine.
    try
    {
        // NOTE: DateTime.ToString() with no format/culture is culture-dependent;
        // preserved as-is because downstream consumers may rely on the current format.
        item.Metadata.Add(new MetadataItem("Created_At", info.CreationTimeUtc.ToString()));
        item.Metadata.Add(new MetadataItem("Last_Accessed_At", info.LastAccessTimeUtc.ToString()));
        item.Metadata.Add(new MetadataItem("Updated_At", info.LastWriteTimeUtc.ToString()));
        item.Metadata.Add(new MetadataItem("Size", info.Length.ToString()));

        // Record one metadata entry per NTFS access rule (explicit + inherited),
        // resolved to NTAccount (domain\user) form.
        FileSecurity fs = info.GetAccessControl(AccessControlSections.Access);
        foreach (FileSystemAccessRule ar in fs.GetAccessRules(true, true, typeof(System.Security.Principal.NTAccount)))
        {
            item.Metadata.Add(new MetadataItem("Access_Rights", ar.IdentityReference.Value, ar.FileSystemRights.ToString()));
        }
    }
    catch (Exception ex)
    {
        // Best-effort by design: ACL reads can fail on permissions; log and continue.
        Log.Error(ex, "Failed to collect metadata: " + info.FullName);
    }
    return Task.FromResult(item);
}
/// <summary>
/// Scans a single file: extracts its text, runs NER over it, and — when
/// identifiers are found — collects file metadata and persists a ScanResult
/// for matching.
/// </summary>
/// <param name="repo">Repository the item belongs to (its id is stamped on the result).</param>
/// <param name="item">Iterator item whose DataObjectIdentifier is the file path; may be null.</param>
/// <returns>
/// The stored ScanResult when identifiers were found; null when the item is
/// null, the file is too big, no text/identifiers were found, or an error occurred.
/// </returns>
public async Task<ScanResult> ScanNext(Repository repo, IteratorItem item)
{
    // FIX: null-check item BEFORE dereferencing it. The original logged
    // item.DataObjectIdentifier first, so a null item threw NRE instead of
    // returning null as intended.
    if (item == null)
    {
        return null;
    }

    ScanResult retVal = null;
    string extension = "";
    Log.Trace("Started processing: " + item.DataObjectIdentifier + ", Thread Id: " + Thread.CurrentThread.ManagedThreadId);
    try
    {
        Dictionary<string, List<string>> identifiers = null;
        FileInfo file = new FileInfo(item.DataObjectIdentifier);
        extension = file.Extension;

        // Don't scan files that are too big (_maxFileSize is in MB).
        if (file.Length > _maxFileSize * 1000000)
        {
            Log.Warn("File too big:" + file.FullName + ", Size:" + file.Length);
            Counter.Add("file_too_big", 1);
        }
        else
        {
            // Get text of the file
            string txt = await FileUtils.Parse(file);
            if (txt != null && txt.Length > 0)
            {
                // Do NER
                identifiers = await NER.Parse(txt, file.FullName);
                if (identifiers != null && identifiers.Count > 0)
                {
                    Counter.Add("found_pi", 1);

                    // Get metadata
                    item = await CollectFileMetadata(item, file);
                    retVal = new ScanResult()
                    {
                        DataObjectIdentifier = item.DataObjectIdentifier,
                        Identifiers = identifiers,
                        Metadata = item.Metadata,
                        RepositoryId = repo.id
                    };

                    // Store results
                    await SMBDal.AddDataObjectForMatching(retVal);
                }
                else
                {
                    Counter.Add("didnot_find_pi", 1);
                }
            }
            Log.Info("Processed file:" + file.FullName + ", Identifiers: " + (identifiers == null ? "0" : identifiers.Count.ToString()));
        }
    }
    catch (Exception ex)
    {
        Log.Error(ex, "Failed to process file: " + item.DataObjectIdentifier);
        return null;
    }

    // FIX: log via item, not retVal — retVal is legitimately null on the
    // too-big / no-text / no-identifier paths and this line sits outside the
    // try/catch, so the original threw an unhandled NullReferenceException.
    Log.Trace("Finished processing: " + item.DataObjectIdentifier + ", Thread Id: " + Thread.CurrentThread.ManagedThreadId);
    Counter.Add(extension, 1);
    return retVal;
}
/// <summary>
/// Convenience overload: pulls the next item from the SMB iterator and
/// delegates to <see cref="ScanNext(Repository, IteratorItem)"/>.
/// </summary>
/// <param name="repo">Repository being scanned.</param>
/// <returns>The scan result for the next item, or null (see the two-argument overload).</returns>
public async Task<ScanResult> ScanNext(Repository repo)
{
    IteratorItem nextItem = await SMBIterator.NextItemAsync();
    return await ScanNext(repo, nextItem);
}
/// <summary>
/// Scans every item produced by the SMB iterator, running up to
/// _maxFilesScanCocur ScanNext tasks concurrently (throttled by a semaphore).
/// When a semaphore slot cannot be acquired within _taskTimeout, tasks that
/// have been running longer than _taskTimeout are cancelled.
/// </summary>
/// <param name="repo">Repository to scan.</param>
/// <returns>Always true; per-file failures are handled inside ScanNext.</returns>
public async Task<bool> Scan(Repository repo)
{
    bool scanDone = false;
    var postTaskTasks = new List<PFTask>();
    object SpinLock = new object();
    PrepCredentials(repo);

    using (var throttler = new SemaphoreSlim(_maxFilesScanCocur))
    {
        // Scan files
        while (!scanDone)
        {
            // _taskTimeout is interpreted as milliseconds by WaitAsync.
            bool semValue = await throttler.WaitAsync(_taskTimeout);
            if (!semValue)
            {
                // Couldn't get a slot: kill tasks that have been running too long.
                // FIX: the original computed t.StartTime - UtcNow (always negative)
                // and read .Seconds (a 0-59 component, unit-mismatched with the
                // millisecond _taskTimeout), so this cancellation never fired.
                foreach (PFTask t in postTaskTasks)
                {
                    if (DateTime.UtcNow.Subtract(t.StartTime).TotalMilliseconds > _taskTimeout)
                    {
                        // NOTE(review): the token is only passed to Task.Run, which
                        // cancels a task that hasn't started; ScanNext itself does not
                        // observe the token, so an in-flight scan is not interrupted —
                        // confirm whether cooperative cancellation is needed.
                        t.TokenSource.Cancel();
                    }
                }
            }

            IteratorItem item = null;
            lock (SpinLock)
            {
                // Get next item for iteration
                item = SMBIterator.NextItem();
            }

            if (item != null)
            {
                // Process the next item. ContinueWith guarantees the throttle
                // slot is released when the scan finishes (success or fault).
                PFTask tsk = new PFTask();
                tsk.Task = Task.Run<ScanResult>(() => ScanNext(repo, item), tsk.CancellationToken).ContinueWith(t => release(throttler));
                postTaskTasks.Add(tsk);
            }
            else
            {
                scanDone = true;
            }

            // Clean the completed tasks from the wait array
            foreach (PFTask t in postTaskTasks)
            {
                if (t.Task.IsCompleted || t.Task.IsCanceled || t.Task.IsFaulted)
                {
                    _tasksToRemove.Add(t);
                }
            }
            foreach (PFTask t in _tasksToRemove)
            {
                postTaskTasks.Remove(t);
                t.Task.Dispose();
            }
            _tasksToRemove = new List<PFTask>();

            if (scanDone)
            {
                // Wait (up to 30s) for all in-flight tasks to finish before exiting.
                Task.WaitAll(postTaskTasks.Select(i => i.Task).ToArray(), 30000);
            }
        }
    }
    Counter.PrintAll();
    return true;
}