/// <summary>
/// Adds a <see cref="FlurlDownloadPipelineStep"/> to the crawler's pipeline.
/// </summary>
/// <param name="crawlerConfiguration">Configuration that receives the new pipeline step.</param>
/// <param name="maxDegreeOfParallelism">
/// Value handed to the step's constructor; when omitted, <see cref="Environment.ProcessorCount"/> is used.
/// </param>
/// <returns>The same <paramref name="crawlerConfiguration"/> instance, so calls can be chained.</returns>
public static CrawlerConfiguration FlurlDownload(this CrawlerConfiguration crawlerConfiguration, int? maxDegreeOfParallelism = null)
{
    int parallelism = maxDegreeOfParallelism ?? Environment.ProcessorCount;
    var downloadStep = new FlurlDownloadPipelineStep(parallelism);

    crawlerConfiguration.AddPipelineStep(downloadStep);
    return crawlerConfiguration;
}
/// <summary>
/// Creates a consumer bound to the given crawler configuration.
/// </summary>
/// <param name="config">Configuration stored on <c>Configuration</c>.</param>
public ArribaDirectIndexerItemConsumer(CrawlerConfiguration config)
{
    Configuration = config;

    // Diagnostics are switched on only when a debugger is attached at construction time.
    DiagnosticsEnabled = Debugger.IsAttached;
    DiagnosticsLevel = VerificationLevel.Normal;
}
/// <summary>
/// Adds a <see cref="GoogleLanguageDetection"/> step to the crawler's pipeline.
/// </summary>
/// <param name="crawlerConfiguration">Configuration that receives the new pipeline step.</param>
/// <param name="maxDegreeOfParallelism">
/// Value handed to the step's constructor; when omitted, <see cref="Environment.ProcessorCount"/> is used.
/// </param>
/// <returns>The same <paramref name="crawlerConfiguration"/> instance, so calls can be chained.</returns>
public static CrawlerConfiguration DetectLanguage(this CrawlerConfiguration crawlerConfiguration, int? maxDegreeOfParallelism = null)
{
    var detectionStep = new GoogleLanguageDetection(maxDegreeOfParallelism ?? Environment.ProcessorCount);
    crawlerConfiguration.AddPipelineStep(detectionStep);

    return crawlerConfiguration;
}
/// <summary>
/// Adds a <see cref="PdfBoxTextExtractorProcessorPipelineStep"/> to the crawler's pipeline.
/// </summary>
/// <param name="crawlerConfiguration">Configuration that receives the new pipeline step.</param>
/// <param name="maxDegreeOfParallelism">
/// Value handed to the step's constructor; when omitted, <see cref="Environment.ProcessorCount"/> is used.
/// </param>
/// <returns>The same <paramref name="crawlerConfiguration"/> instance, so calls can be chained.</returns>
public static CrawlerConfiguration PdfTextExtractProcessor(this CrawlerConfiguration crawlerConfiguration, int? maxDegreeOfParallelism = null)
{
    int parallelism = maxDegreeOfParallelism ?? Environment.ProcessorCount;

    crawlerConfiguration.AddPipelineStep(new PdfBoxTextExtractorProcessorPipelineStep(parallelism));
    return crawlerConfiguration;
}
/// <summary>
/// Creates a consumer that talks to an Arriba service through an <see cref="ArribaClient"/>.
/// </summary>
/// <param name="config">Crawler configuration; its <c>ArribaTable</c> value selects the table taken from the client.</param>
/// <param name="serviceUrl">Base URL of the Arriba service.</param>
public ArribaClientIndexerItemConsumer(CrawlerConfiguration config, string serviceUrl)
{
    Configuration = config;
    ServiceUrl = serviceUrl;

    // Allow long timeouts (save for huge databases is slow)
    Uri serviceUri = new Uri(ServiceUrl);
    TimeSpan requestTimeout = TimeSpan.FromMinutes(15);
    Client = new ArribaClient(serviceUri, requestTimeout);

    Table = Client[Configuration.ArribaTable];
}
// Application entry point: prepares WinForms, resets the "output" directory, starts the crawl
// on a background task, and shows the main form until it is closed.
static void Main()
{
    Application.SetHighDpiMode(HighDpiMode.SystemAware);
    Application.EnableVisualStyles();
    Application.SetCompatibleTextRenderingDefault(false);
    Program.form = new Form1();

    // Every run starts from an empty "output" directory under the current working directory.
    string outputDirectory = Path.Join(Directory.GetCurrentDirectory(), "output");
    if (Directory.Exists(outputDirectory))
    {
        Directory.Delete(outputDirectory, true);
    }

    Directory.CreateDirectory(outputDirectory);

    var crawlerSettings = new CrawlerConfiguration()
    {
        SaveRobotsFile = true,
        SaveSitemapFiles = false,
        SaveUrls = true,
        DeleteHtmlAfterScrape = true,
        SerializeSite = true,
        SerializeGraph = true
    };
    var cancellationSource = new CancellationTokenSource();
    Uri[] seeds = { new Uri("https://www.google.com/") };

    Program.crawler = new Crawler(crawlerSettings, seeds, outputDirectory, cancellationSource.Token);
    Program.RegisterEvents();

    // The crawl runs off the UI thread; the block-bodied lambda keeps this an Action-based Task.Run.
    Task crawlTask = Task.Run(() =>
    {
        Program.crawler.Crawl();
    });

    // Blocks until the main form is closed.
    Application.Run(Program.form);

    // Once the UI is gone, signal cancellation and wait for the crawl task to finish.
    cancellationSource.Cancel();
    crawlTask.Wait();
    cancellationSource.Dispose();
}
/// <summary>
/// Creates a provider that reads work items from the TFS collection named by the configuration.
/// </summary>
/// <param name="config">
/// Crawler configuration. <c>ItemDatabaseName</c> is used as the collection URI, <c>ItemQuery</c>
/// falls back to <c>DefaultQuery</c> when null, and <c>AuthenticationMode</c> selects how the
/// connection is made ("aad", "integrated", or unset/empty for integrated).
/// </param>
/// <exception cref="NotImplementedException">
/// Thrown when <c>AuthenticationMode</c> is not one of the supported values.
/// </exception>
public TfsItemProvider(CrawlerConfiguration config)
{
    this.DatabaseUri = config.ItemDatabaseName;
    this.Query = config.ItemQuery ?? DefaultQuery;
    this.ColumnMappings = config.ColumnMappings;

    this.SerializerSettings = new JsonSerializerSettings();
    this.SerializerSettings.Converters.Add(new AttachmentCollectionJsonConverter());
    this.SerializerSettings.Converters.Add(new LinkCollectionJsonConverter());
    this.SerializerSettings.Converters.Add(new RevisionCollectionJsonConverter());

    // TODO: Figure out Personal Access Tokens (may only work for REST APIs)
    //  https://www.visualstudio.com/en-us/docs/setup-admin/team-services/use-personal-access-tokens-to-authenticate

    // Connect to TFS with the configured authentication mode; an unset mode means integrated auth.
    switch ((config.AuthenticationMode ?? String.Empty).ToLowerInvariant())
    {
        case "aad":
            // https://www.visualstudio.com/en-us/docs/setup-admin/team-services/manage-organization-access-for-your-account-vs
            Trace.WriteLine(string.Format("Connecting to '{0}' [AAD]...", this.DatabaseUri));
            this.Store = new WorkItemStore(new TfsTeamProjectCollection(new Uri(this.DatabaseUri), new VssAadCredential()));
            break;

        // Deprecated, but works.
        //case "alternate":
        //    Trace.WriteLine(string.Format("Connecting to '{0}' [Alternate]...", this.DatabaseUri));
        //    string encryptedBase64Password = File.ReadAllText(string.Format(EncryptedPasswordFilePathFormat, config.ConfigurationName));
        //    string unprotectedPassword = DecryptLocalUserPassword(encryptedBase64Password);
        //    //var credential = new TfsClientCredentials(new Microsoft.TeamFoundation.Client.WindowsCredential(new NetworkCredential(config.UserName, unprotectedPassword)));
        //    //credential.AllowInteractive = false;
        //    var tpc = new TfsTeamProjectCollection(new Uri(this.DatabaseUri), credential);
        //    this.Store = new WorkItemStore(tpc);
        //    break;

        case "integrated":
        case "":
            Trace.WriteLine(string.Format("Connecting to '{0}' [Integrated Auth]...", this.DatabaseUri));
            this.Store = new WorkItemStore(this.DatabaseUri);
            break;

        default:
            // Only list modes this switch actually handles; 'token' is not implemented (see TODO above).
            throw new NotImplementedException(string.Format("TfsItemProvider has no implementation for authenticationMode \"{0}\". Use 'aad' or 'integrated'.", config.AuthenticationMode));
    }

    // Debug Only: Get the fields list
    //IList<string> fields = GetStoreFields();
    //File.WriteAllLines(string.Format("{0}.Fields.txt", config.ArribaTable), fields);
}
/// <summary>
/// Creates the <see cref="IItemProvider"/> named by <c>config.ItemProvider</c>.
/// </summary>
/// <param name="config">Crawler configuration; must not be null.</param>
/// <returns>
/// A <see cref="TfsItemProvider"/> when the provider name is unset, empty, or "tfsitemprovider"
/// (compared case-insensitively).
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="config"/> is null.</exception>
/// <exception cref="InvalidOperationException">Thrown when the provider name is not recognized.</exception>
public static IItemProvider Build(CrawlerConfiguration config)
{
    if (config == null)
    {
        throw new ArgumentNullException("config", "config is null.");
    }

    // An absent setting is treated like the empty string so it selects the default provider
    // instead of failing with a NullReferenceException.
    switch ((config.ItemProvider ?? String.Empty).ToLowerInvariant())
    {
        case "":
        case "tfsitemprovider":
            return new TfsItemProvider(config);

        default:
            throw new InvalidOperationException(String.Format("{0} is an unknown Item Provider", config.ItemProvider));
    }
}
/// <summary>
/// Prompts on the console for a password, encrypts it with
/// <c>TfsItemProvider.LocalUserEncryptPassword</c>, and writes the result to the file path
/// derived from <c>config.ConfigurationName</c>.
/// </summary>
/// <param name="config">Crawler configuration whose <c>ConfigurationName</c> determines the output path.</param>
/// <returns>0 after the encrypted password is written; -1 when nothing was entered.</returns>
public static int EncryptPassword(CrawlerConfiguration config)
{
    Console.Write("Enter TFS Online Password to local user encrypt: ");
    string entered = Console.ReadLine();

    if (!String.IsNullOrEmpty(entered))
    {
        string targetPath = string.Format(TfsItemProvider.EncryptedPasswordFilePathFormat, config.ConfigurationName);
        string cipherText = TfsItemProvider.LocalUserEncryptPassword(entered);
        File.WriteAllText(targetPath, cipherText);

        Console.WriteLine("Encrypted Password written to '{0}'. Run Crawler to test.", targetPath);
        Console.WriteLine();
        return 0;
    }

    return -1;
}
// Validates a Douban group / city pair posted as JSON ("groupId", "cityName") by fetching one
// page of topics for it, and answers with an { success, error } payload.
//
// Parameters:
//   model - request body; may be null or may lack either property.
// Returns:
//   Always an Ok(...) result; failure is reported through success = false plus an error text.
public IActionResult AddDouBanGroup([FromBody] JToken model)
{
    // '?.' before ToString() as well: a body without the property yields null from the indexer,
    // which must reach the validation below instead of throwing a NullReferenceException.
    string doubanGroup = model?["groupId"]?.ToString();
    string cityName = model?["cityName"]?.ToString();
    if (string.IsNullOrEmpty(doubanGroup) || string.IsNullOrEmpty(cityName))
    {
        return Ok(new { success = false, error = "请输入豆瓣小组Group和城市名称。" });
    }

    // The group/city pair is considered valid only if the crawler returns at least one topic.
    var topics = DoubanHouseCrawler.GetHouseData(doubanGroup, cityName, 1);
    if (topics == null || !topics.Any())
    {
        return Ok(new { success = false, error = "保存失败!请检查豆瓣小组ID(如:XMhouse)/城市名称(如:厦门)是否正确..." });
    }

    var cityInfo = $"{{ 'groupid':'{doubanGroup}','cityname':'{cityName}','pagecount':5}}";

    // NOTE(review): doubanConfig is a freshly constructed object, so this check is always true
    // and the insert below never runs. This looks like a placeholder for a lookup of an existing
    // configuration; the lookup API is not visible here, so behavior is left unchanged - confirm
    // the intended duplicate check and restore it.
    var doubanConfig = new CrawlerConfiguration();
    if (doubanConfig != null)
    {
        return Ok(new { success = true });
    }

    var config = new CrawlerConfiguration()
    {
        ConfigurationKey = 0,
        ConfigurationValue = cityInfo,
        ConfigurationName = ConstConfigName.Douban,
        DataCreateTime = DateTime.Now,
        IsEnabled = true,
    };
    configurationDapper.Insert(config);
    return Ok(new { success = true });
}
/// <summary>
/// Creates the <see cref="IItemConsumer"/> named by <c>config.ItemConsumer</c>.
/// </summary>
/// <param name="config">Crawler configuration; must not be null.</param>
/// <returns>
/// An <see cref="ArribaClientIndexerItemConsumer"/> for an unset, empty, or "arribaclient" name
/// (using "http://localhost:42784" when <c>ArribaServiceUrl</c> is null), an
/// <see cref="ArribaDirectIndexerItemConsumer"/> for "arribadirect", or a
/// <see cref="CsvWriterItemConsumer"/> for "csvwriter". Names are compared case-insensitively.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="config"/> is null.</exception>
/// <exception cref="InvalidOperationException">Thrown when the consumer name is not recognized.</exception>
public static IItemConsumer Build(CrawlerConfiguration config)
{
    if (config == null)
    {
        throw new ArgumentNullException("config", "config is null.");
    }

    // An absent setting is treated like the empty string so it selects the default consumer
    // instead of failing with a NullReferenceException.
    switch ((config.ItemConsumer ?? String.Empty).ToLowerInvariant())
    {
        case "":
        case "arribaclient":
            return new ArribaClientIndexerItemConsumer(config, config.ArribaServiceUrl ?? "http://localhost:42784");

        case "arribadirect":
            return new ArribaDirectIndexerItemConsumer(config);

        case "csvwriter":
            return new CsvWriterItemConsumer(config.ArribaTable, "Changed Date");

        default:
            throw new InvalidOperationException(String.Format("{0} is an unknown Item Consumer", config.ItemConsumer));
    }
}
/// <summary>
/// Placeholder constructor: it currently ignores <paramref name="configuration"/> and performs
/// no initialization.
/// </summary>
/// <param name="configuration">Crawler configuration (unused for now).</param>
public Crawler(CrawlerConfiguration configuration)
{
    // TODO: Implement this and delete the other .ctor
}
/// <summary>
/// Runs the parameterless constructor, then stores the supplied configuration.
/// </summary>
/// <param name="configuration">Configuration kept in <c>_configuration</c>.</param>
public Crawler(CrawlerConfiguration configuration)
    : this()
{
    this._configuration = configuration;
}
/// <summary>
/// Adds a <see cref="RobotsPipelineStep"/> to the crawler's pipeline.
/// </summary>
/// <param name="crawlerConfiguration">Configuration that receives the step and supplies its <c>Logger</c>.</param>
/// <param name="searchPath">Passed through to the step's constructor; null when omitted.</param>
/// <returns>The same <paramref name="crawlerConfiguration"/> instance, so calls can be chained.</returns>
public static CrawlerConfiguration Robots(this CrawlerConfiguration crawlerConfiguration, string searchPath = null)
{
    var robotsStep = new RobotsPipelineStep(searchPath, crawlerConfiguration.Logger);
    crawlerConfiguration.AddPipelineStep(robotsStep);

    return crawlerConfiguration;
}