/// <summary>
/// Builds the view-model for an open crawling file: wraps every resource in a
/// <c>ResourceViewModel</c> and constructs the tree of grouping nodes
/// ("By processing status", "By http status", "By bucket").
/// </summary>
/// <param name="model">The crawling context this view-model presents.</param>
/// <param name="dispatcherQueue">Queue used to marshal collection updates onto the UI thread.</param>
public FileViewModel(CrawlingContext model, DispatcherQueue dispatcherQueue)
{
    Model = model;
    Resources = model.Resources
        .Select(x => new ResourceViewModel(x, dispatcherQueue))
        .Dispatch(dispatcherQueue);

    // NOTE(review): these LINQ queries are deferred — the Node tree re-evaluates
    // them whenever it enumerates, so they reflect later changes to Resources.
    var orderedResources = Resources.OrderBy(x => x.HttpStatus).ThenBy(x => x.URL);
    var processedResources = orderedResources.Where(x => x.Status == ResourceStatus.Processed);

    Nodes = new ObservableCollection<Node>
    {
        new Node("Resources", true, BitmapIcons.Folder, null,
            new Node("By processing status", true, BitmapIcons.Folder, orderedResources,
                EnumHelper<ResourceStatus>.Values
                    .Select(status => CreateNodeSplitByDomain(
                        DescriptionExtractor.GetDescription(status),
                        BitmapIcons.WorkStatuses.TryGetValue(status),
                        orderedResources.Where(x => x.Status == status)))
                    .ToArray()),
            new Node("By http status", true, BitmapIcons.Folder, processedResources,
                from resource in processedResources
                group resource by resource.HttpStatus into grp
                orderby grp.Key
                select new Node(DescriptionExtractor.GetDescription(grp.Key), false,
                    BitmapIcons.GetImageFromHttpStatus(grp.Key), grp, SplitByDomain(grp))),
            new Node("By bucket", true, BitmapIcons.Folder, null,
                from resource in Resources
                group resource by resource.CurrentBucket ?? "" into grp
                where grp.Key != ""
                // FIX: IGrouping<,> exposes no Count property — the Enumerable.Count()
                // extension method is required to order buckets by size.
                orderby grp.Count() descending, grp.Key
                select new Node(grp.Key, false, BitmapIcons.Folder, grp)))
    };

    Model.PropertyChanged += Model_PropertyChanged;
}
/// <summary>
/// Creates a crawler over the given context: indexes the already-known
/// resources by absolute URL and wires up the work dispatcher.
/// </summary>
/// <param name="context">Crawling context that owns the resource collection.</param>
/// <param name="limitResources">Optional cap on the number of resources to process; null means unlimited.</param>
public Crawler(CrawlingContext context, int? limitResources = null)
{
    Context = context;
    _limitResources = limitResources;

    // Index existing resources by absolute URL for fast lookup.
    Resources = Context.Resources.ToDictionary(x => x.Url.AbsoluteUri);

    WorkDispatcher = new WorkDispatcher<Resource, BucketContext>(ProcessResource);
    WorkDispatcher.PropertyChanged += TrackerOnPropertyChanged;
}
/// <summary>
/// Wraps a crawling context in a <c>FileViewModel</c>, adds it to the list of
/// open files and makes it the currently selected file.
/// </summary>
/// <param name="model">The crawling context to present.</param>
private void AddFile(CrawlingContext model)
{
    var queue = new DispatcherQueue(Dispatcher);
    var viewModel = new FileViewModel(model, queue);

    Model.Files.Add(viewModel);
    Model.SelectedFile = viewModel;
}
/// <summary>
/// Rebuilds a <c>CrawlingContext</c> from a serialized resources XML element.
/// </summary>
/// <param name="resourcesNode">Element previously produced by the resources serializer.</param>
/// <returns>A fresh context populated with the deserialized resources.</returns>
public static CrawlingContext DeserializeContext(XElement resourcesNode)
{
    var resources = DeserializeResourceCollection(resourcesNode);

    var context = new CrawlingContext();
    context.Resources.AddRange(resources);
    return context;
}
/// <summary>
/// Starts crawling the selected file with the selected configuration: creates
/// a crawler, maps the configured buckets and rules onto it, queues the
/// starting URLs and moves the context into the Processing state.
/// </summary>
private void CrawlingPlay_Executed(object sender, ExecutedRoutedEventArgs e)
{
    // FIX: guard both selections — the original only checked SelectedConfig and
    // dereferenced Model.SelectedFile.Model, NRE-ing when no file is open.
    if (Model.SelectedConfig == null || Model.SelectedFile == null)
    {
        return;
    }

    CrawlingContext context = Model.SelectedFile.Model;
    int nbRetry = Int32.Parse(ConfigurationManager.AppSettings["NbRetry"]);

    context.Crawler = new Crawler(context, null);
    context.Crawler.OnCompleted += (_, __) =>
    {
        context.Status = CrawlingStatus.Ready;
        context.Crawler = null;
        RefreshActions();
    };

    // Create one work bucket per configured bucket, remembering the mapping so
    // the rules below can target the right bucket.
    var bucketMapping = new Dictionary<CrawlingBucket, WorkBucket<Resource, BucketContext>>();
    foreach (CrawlingBucket bucket in Model.SelectedConfig.Buckets)
    {
        BucketContext bucketContext = new BucketContext(nbRetry);
        foreach (CrawlingHostMapping mapping in bucket.HostMappings)
        {
            bucketContext.Hosts[mapping.Host] = mapping.IPAddress;
        }

        WorkBucket<Resource, BucketContext> workBucket =
            context.Crawler.AddBucket(bucket.Name, bucket.NbThreads, bucketContext);
        bucketMapping.Add(bucket, workBucket);
    }

    foreach (CrawlingRule rule in Model.SelectedConfig.Rules)
    {
        WorkBucket<Resource, BucketContext> workBucket = bucketMapping[rule.TargetBucket];
        context.Crawler.AddBehaviorRule(rule.Name, rule.Behavior, workBucket, rule.Conditions);
    }

    context.Crawler.Reprocess();

    foreach (CrawlingStartingUrl startingUrl in Model.SelectedConfig.StartingUrls)
    {
        context.Crawler.AddUrlToProcess(startingUrl.Value);
    }

    if (context.Crawler.WorkDispatcher.IsWorking)
    {
        context.Status = CrawlingStatus.Processing;
        RefreshActions();
    }
    else
    {
        MessageBox.Show(this, "Nothing to do", "Information",
            MessageBoxButton.OK, MessageBoxImage.Information);
    }
}
/// <summary>
/// Stops the crawler of the selected file, if one is currently running, and
/// moves the context into the Stopping state.
/// </summary>
private void CrawlingStop_Executed(object sender, ExecutedRoutedEventArgs e)
{
    CrawlingContext context = Model.SelectedFile.Model;
    Crawler crawler = context.Crawler;

    // FIX: the crawler's OnCompleted handler clears context.Crawler, so a stale
    // stop command could NRE on context.Crawler.WorkDispatcher — guard for null.
    if (crawler == null || !crawler.WorkDispatcher.IsWorking)
    {
        return;
    }

    context.Status = CrawlingStatus.Stopping;
    crawler.Stop();
    RefreshActions();
}
/// <summary>
/// Saves a crawling file to disk. Prompts for a destination when the file has
/// never been saved (or "Save As" was requested), then serializes the
/// resources on a background thread, reporting failures via a message box.
/// </summary>
/// <param name="file">The crawling context to save.</param>
/// <param name="saveAs">True to force the save dialog even for an already-saved file.</param>
private void SaveFile(CrawlingContext file, bool saveAs)
{
    FileInfo target;
    if (file.FullPath == null || saveAs)
    {
        // Ask the user where to save.
        SaveFileDialog dlg = new SaveFileDialog
        {
            FileName = file.FullPath == null ? "CrawlingResult.cwl" : file.FullPath.FullName,
            DefaultExt = ".cwl",
            Filter = "Crawling result (.cwl)|*.cwl"
        };

        if (dlg.ShowDialog() != true)
        {
            return; // user cancelled
        }

        target = new FileInfo(dlg.FileName);
    }
    else
    {
        target = file.FullPath;
    }

    // (The original re-checked target for null here; every path that reaches
    // this point has assigned a non-null target, so the check was dead code.)
    file.Status = CrawlingStatus.Saving;
    ThreadPool.QueueUserWorkItem(_ =>
    {
        try
        {
            XElement element = ResourcesSerializer.SerializeResourceCollection(file.Resources);
            element.Save(target.FullName);
            file.FullPath = target;
            file.HasChanged = false;
        }
        catch (Exception e)
        {
            Dispatcher.Invoke((Action)(() =>
                MessageBox.Show("Failed to save file " + target.Name + ": " + e.Message)));
        }
        finally
        {
            file.Status = CrawlingStatus.Ready;
            RefreshActions();
        }
    });
}
/// <summary>
/// Prompts the user for a list of URLs and adds each one — with any session id
/// stripped — to the selected file as a ready-to-process resource, skipping
/// URLs that are already present in the context.
/// </summary>
private void ImportUrls_Executed(object sender, ExecutedRoutedEventArgs e)
{
    CrawlingContext context = Model.SelectedFile.Model;
    if (context.Status != CrawlingStatus.Ready)
    {
        return;
    }

    List<Uri> urls = ImportUrlsWindow.PromptUrls(this);

    // Seed the set with the absolute URIs already known to the context so
    // duplicates are detected in O(1).
    var existingUrls = new HashSet<string>(context.Resources.Select(x => x.Url.AbsoluteUri));

    foreach (Uri url in urls)
    {
        Uri cleanUrl = url.WithoutSession();

        // HashSet.Add returns false when the URI is already present,
        // collapsing the original Contains + Add pair into one call.
        if (existingUrls.Add(cleanUrl.AbsoluteUri))
        {
            var resource = new Resource(cleanUrl, ResourceBehavior.Ignore)
            {
                Status = ResourceStatus.ReadyToProcess
            };
            context.Resources.Add(resource);
        }
    }
}
/// <summary>
/// Opens a crawling file: creates a context in the Loading state, shows it
/// immediately, then deserializes the resources on a background thread.
/// </summary>
/// <param name="file">The .cwl file to load.</param>
private void LoadFile(FileInfo file)
{
    CrawlingContext context = new CrawlingContext();
    context.FullPath = new FileInfo(file.FullName);
    context.HasChanged = false;
    context.Status = CrawlingStatus.Loading;
    AddFile(context);

    ThreadPool.QueueUserWorkItem(_ =>
    {
        try
        {
            XDocument doc = XDocument.Load(file.FullName);
            List<Resource> resources = ResourcesSerializer.DeserializeResourceCollection(doc.Root);
            foreach (Resource resource in resources)
            {
                context.Resources.Add(resource);
            }
        }
        catch (Exception ex)
        {
            // FIX: the original had no exception handling here — a missing or
            // malformed file threw on the thread pool (process-killing unhandled
            // exception) and left the context stuck in the Loading state.
            Dispatcher.Invoke((Action)(() =>
                MessageBox.Show("Failed to load file " + file.Name + ": " + ex.Message)));
        }
        finally
        {
            context.Status = CrawlingStatus.Ready;
            RefreshActions();
        }
    });
}
/// <summary>
/// Generates the Excel report for the selected file: prompts for a
/// destination, then builds all report sheets on a background thread and
/// saves the workbook, reporting failures via a message box.
/// </summary>
private void ReportingGenerate_Executed(object sender, ExecutedRoutedEventArgs e)
{
    CrawlingContext context = Model.SelectedFile.Model;
    string defaultFileName = context.FullPath != null
        ? Path.GetFileNameWithoutExtension(context.FullPath.FullName)
        : "Report";

    // Ask the user where to save the report.
    SaveFileDialog dlg = new SaveFileDialog
    {
        FileName = defaultFileName,
        DefaultExt = ".xlsx",
        Filter = "Excel document (.xlsx)|*.xlsx"
    };

    if (dlg.ShowDialog() != true)
    {
        return; // user cancelled
    }

    context.Status = CrawlingStatus.GeneratingReport;
    ThreadPool.QueueUserWorkItem(_ =>
    {
        try
        {
            // FIX: ExcelPackage is IDisposable and was never disposed.
            using (ExcelPackage pkg = new ExcelPackage())
            {
                ReportSerializer.AddReport(pkg, ReportGenerator.GenerateRequestsReport(context.Resources));
                ReportSerializer.AddReport(pkg, ReportGenerator.GenerateCachingReport(context.Resources));
                ReportSerializer.AddReport(pkg, ReportGenerator.GenerateReferenceSummaryReport(context.Resources));
                ReportSerializer.AddReport(pkg, ReportGenerator.GenerateReferencesReport(context.Resources));
                ReportSerializer.AddReport(pkg, ReportGenerator.GenerateRedirectionReport(context.Resources));
                ReportSerializer.AddReport(pkg, ReportGenerator.GenerateContentReport(context.Resources));
                ReportSerializer.AddReport(pkg, ReportGenerator.GenerateErrorsReport(context.Resources));
                pkg.SaveAs(new FileInfo(dlg.FileName));
            }
        }
        catch (Exception ex)
        {
            // FIX: the original worker had no exception handling — SaveAs on a
            // locked destination file threw on the thread pool (process-killing)
            // and left the context stuck in GeneratingReport.
            Dispatcher.Invoke((Action)(() =>
                MessageBox.Show("Failed to generate report: " + ex.Message)));
        }
        finally
        {
            context.Status = CrawlingStatus.Ready;
            RefreshActions();
        }
    });
}