예제 #1
0
 /// <summary>
 /// Adds a <see cref="FlurlDownloadPipelineStep"/> to the given crawler configuration.
 /// </summary>
 /// <param name="crawlerConfiguration">Configuration that receives the new pipeline step.</param>
 /// <param name="maxDegreeOfParallelism">
 /// Value passed to the step's constructor; when <c>null</c>, <see cref="Environment.ProcessorCount"/> is used instead.
 /// </param>
 /// <returns>The same <paramref name="crawlerConfiguration"/> instance that was passed in.</returns>
 public static CrawlerConfiguration FlurlDownload(this CrawlerConfiguration crawlerConfiguration,
                                                  int? maxDegreeOfParallelism = null)
 {
     int parallelism = maxDegreeOfParallelism ?? Environment.ProcessorCount;
     var downloadStep = new FlurlDownloadPipelineStep(parallelism);

     crawlerConfiguration.AddPipelineStep(downloadStep);
     return crawlerConfiguration;
 }
예제 #2
0
        /// <summary>
        /// Creates a consumer bound to the supplied crawler configuration.
        /// </summary>
        /// <param name="config">Configuration stored in <c>Configuration</c>.</param>
        public ArribaDirectIndexerItemConsumer(CrawlerConfiguration config)
        {
            Configuration = config;

            // Diagnostics start enabled only when a debugger is attached at construction time.
            DiagnosticsEnabled = Debugger.IsAttached;
            DiagnosticsLevel = VerificationLevel.Normal;
        }
예제 #3
0
 /// <summary>
 /// Adds a <see cref="GoogleLanguageDetection"/> step to the given crawler configuration.
 /// </summary>
 /// <param name="crawlerConfiguration">Configuration that receives the new pipeline step.</param>
 /// <param name="maxDegreeOfParallelism">
 /// Value passed to the step's constructor; when <c>null</c>, <see cref="Environment.ProcessorCount"/> is used instead.
 /// </param>
 /// <returns>The same <paramref name="crawlerConfiguration"/> instance that was passed in.</returns>
 public static CrawlerConfiguration DetectLanguage(this CrawlerConfiguration crawlerConfiguration,
                                                   int? maxDegreeOfParallelism = null)
 {
     int parallelism = maxDegreeOfParallelism ?? Environment.ProcessorCount;
     var detectionStep = new GoogleLanguageDetection(parallelism);

     crawlerConfiguration.AddPipelineStep(detectionStep);
     return crawlerConfiguration;
 }
        /// <summary>
        /// Adds a <see cref="PdfBoxTextExtractorProcessorPipelineStep"/> to the given crawler configuration.
        /// </summary>
        /// <param name="crawlerConfiguration">Configuration that receives the new pipeline step.</param>
        /// <param name="maxDegreeOfParallelism">
        /// Value passed to the step's constructor; when <c>null</c>, <see cref="Environment.ProcessorCount"/> is used instead.
        /// </param>
        /// <returns>The same <paramref name="crawlerConfiguration"/> instance that was passed in.</returns>
        public static CrawlerConfiguration PdfTextExtractProcessor(this CrawlerConfiguration crawlerConfiguration,
                                                                   int? maxDegreeOfParallelism = null)
        {
            int parallelism = maxDegreeOfParallelism ?? Environment.ProcessorCount;

            crawlerConfiguration.AddPipelineStep(new PdfBoxTextExtractorProcessorPipelineStep(parallelism));
            return crawlerConfiguration;
        }
        /// <summary>
        /// Creates a consumer that talks to an Arriba service over HTTP.
        /// </summary>
        /// <param name="config">Configuration stored in <c>Configuration</c>; its <c>ArribaTable</c> selects the table.</param>
        /// <param name="serviceUrl">Address of the Arriba service; parsed into a <see cref="Uri"/>.</param>
        public ArribaClientIndexerItemConsumer(CrawlerConfiguration config, string serviceUrl)
        {
            Configuration = config;
            ServiceUrl = serviceUrl;

            // A generous 15-minute timeout: saving a huge database is slow.
            var endpoint = new Uri(ServiceUrl);
            var timeout = TimeSpan.FromMinutes(15);

            Client = new ArribaClient(endpoint, timeout);
            Table = Client[Configuration.ArribaTable];
        }
예제 #6
0
        /// <summary>
        /// Application entry point. Recreates the "output" directory under the current
        /// directory, builds a crawler for the seed URLs, runs it on a background task
        /// while the form is shown, then cancels and waits for the crawl once the form closes.
        /// </summary>
        static void Main()
        {
            Application.SetHighDpiMode(HighDpiMode.SystemAware);
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);

            Program.form = new Form1();

            string outputPath = Path.Join(Directory.GetCurrentDirectory(), "output");

            // Start every run from an empty output directory.
            if (Directory.Exists(outputPath))
            {
                Directory.Delete(outputPath, true);
            }
            Directory.CreateDirectory(outputPath);

            var configuration = new CrawlerConfiguration()
            {
                SaveRobotsFile        = true,
                SaveSitemapFiles      = false,
                SaveUrls              = true,
                DeleteHtmlAfterScrape = true,
                SerializeSite         = true,
                SerializeGraph        = true
            };

            // The using declaration disposes the source when Main exits, including when
            // Application.Run or crawlTask.Wait() throws; previously Dispose was skipped then.
            using var cancellation = new CancellationTokenSource();

            var seedUrls = new Uri[]
            {
                new Uri("https://www.google.com/")
            };

            Program.crawler = new Crawler(configuration, seedUrls, outputPath, cancellation.Token);

            Program.RegisterEvents();

            var crawlTask = Task.Run(() =>
            {
                Program.crawler.Crawl();
            });

            // Blocks until the form is closed.
            Application.Run(Program.form);

            // Signal the crawler to stop and wait for the background task to finish.
            cancellation.Cancel();
            crawlTask.Wait();
        }
예제 #7
0
        /// <summary>
        /// Creates a provider from the crawler configuration and connects to the work item store.
        /// </summary>
        /// <param name="config">
        /// Supplies the database URI, the item query (<c>DefaultQuery</c> is used when it is null),
        /// the column mappings and the authentication mode.
        /// </param>
        /// <exception cref="NotImplementedException">
        /// <c>config.AuthenticationMode</c> is neither "aad", "integrated", empty, nor null (compared case-insensitively).
        /// </exception>
        public TfsItemProvider(CrawlerConfiguration config)
        {
            this.DatabaseUri    = config.ItemDatabaseName;
            this.Query          = config.ItemQuery ?? DefaultQuery;
            this.ColumnMappings = config.ColumnMappings;

            this.SerializerSettings = new JsonSerializerSettings();
            this.SerializerSettings.Converters.Add(new AttachmentCollectionJsonConverter());
            this.SerializerSettings.Converters.Add(new LinkCollectionJsonConverter());
            this.SerializerSettings.Converters.Add(new RevisionCollectionJsonConverter());

            // TODO: Figure out Personal Access Tokens (may only work for REST APIs)
            // https://www.visualstudio.com/en-us/docs/setup-admin/team-services/use-personal-access-tokens-to-authenticate

            // Connect to TFS using the configured authentication mode; a null or empty mode means integrated auth.
            switch ((config.AuthenticationMode ?? String.Empty).ToLowerInvariant())
            {
            case "aad":
                // https://www.visualstudio.com/en-us/docs/setup-admin/team-services/manage-organization-access-for-your-account-vs
                Trace.WriteLine(string.Format("Connecting to '{0}' [AAD]...", this.DatabaseUri));
                this.Store = new WorkItemStore(new TfsTeamProjectCollection(new Uri(this.DatabaseUri), new VssAadCredential()));
                break;
            // Deprecated, but works.
            //case "alternate":
            //    Trace.WriteLine(string.Format("Connecting to '{0}' [Alternate]...", this.DatabaseUri));
            //    string encryptedBase64Password = File.ReadAllText(string.Format(EncryptedPasswordFilePathFormat, config.ConfigurationName));
            //    string unprotectedPassword = DecryptLocalUserPassword(encryptedBase64Password);

            //    //var credential = new TfsClientCredentials(new Microsoft.TeamFoundation.Client.WindowsCredential(new NetworkCredential(config.UserName, unprotectedPassword)));
            //    //credential.AllowInteractive = false;

            //    var tpc = new TfsTeamProjectCollection(new Uri(this.DatabaseUri), credential);
            //    this.Store = new WorkItemStore(tpc);
            //    break;
            case "integrated":
            case "":
                Trace.WriteLine(string.Format("Connecting to '{0}' [Integrated Auth]...", this.DatabaseUri));
                this.Store = new WorkItemStore(this.DatabaseUri);
                break;

            default:
                // Only list modes that actually have a case above; 'token' is not implemented (see TODO).
                throw new NotImplementedException(string.Format("TfsItemProvider has no implementation for authenticationMode \"{0}\". Use 'aad' or 'integrated'.", config.AuthenticationMode));
            }

            // Debug Only: Get the fields list
            //IList<string> fields = GetStoreFields();
            //File.WriteAllLines(string.Format("{0}.Fields.txt", config.ArribaTable), fields);
        }
        /// <summary>
        /// Creates the item provider named by <c>config.ItemProvider</c>.
        /// </summary>
        /// <param name="config">Crawler configuration; must not be null.</param>
        /// <returns>A <see cref="TfsItemProvider"/> when the name is null, empty or "tfsitemprovider" (case-insensitive).</returns>
        /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The provider name is not recognised.</exception>
        public static IItemProvider Build(CrawlerConfiguration config)
        {
            if (config == null)
            {
                throw new ArgumentNullException("config", "config is null.");
            }

            // An unset (null) provider name is treated like the empty string, i.e. the default
            // provider, instead of failing with a NullReferenceException.
            switch ((config.ItemProvider ?? String.Empty).ToLowerInvariant())
            {
            case "":
            case "tfsitemprovider":
                return new TfsItemProvider(config);

            default:
                throw new InvalidOperationException(String.Format("{0} is an unknown Item Provider", config.ItemProvider));
            }
        }
예제 #9
0
        /// <summary>
        /// Prompts on the console for a password, encrypts it with
        /// <c>TfsItemProvider.LocalUserEncryptPassword</c> and writes the result to the file
        /// whose path is built from <c>TfsItemProvider.EncryptedPasswordFilePathFormat</c>
        /// and <c>config.ConfigurationName</c>.
        /// </summary>
        /// <param name="config">Configuration whose <c>ConfigurationName</c> is substituted into the file path format.</param>
        /// <returns><c>0</c> when the file was written; <c>-1</c> when no password was entered.</returns>
        public static int EncryptPassword(CrawlerConfiguration config)
        {
            Console.Write("Enter TFS Online Password to local user encrypt: ");
            string entered = Console.ReadLine();

            // Nothing typed (or input closed): report failure without touching the file.
            if (String.IsNullOrEmpty(entered))
            {
                return -1;
            }

            string targetPath = string.Format(TfsItemProvider.EncryptedPasswordFilePathFormat, config.ConfigurationName);
            string protectedText = TfsItemProvider.LocalUserEncryptPassword(entered);
            File.WriteAllText(targetPath, protectedText);

            Console.WriteLine("Encrypted Password written to '{0}'. Run Crawler to test.", targetPath);
            Console.WriteLine();

            return 0;
        }
예제 #10
0
        /// <summary>
        /// Validates a Douban group id / city name pair by fetching one page of house data
        /// for it, and reports the outcome as an <c>Ok</c> result carrying a <c>success</c> flag.
        /// </summary>
        /// <param name="model">Request body; the "groupId" and "cityName" properties are read from it.</param>
        /// <returns>
        /// <c>Ok</c> with <c>success = false</c> and an <c>error</c> message when either value is
        /// missing or no topics are returned; otherwise <c>Ok</c> with <c>success = true</c>.
        /// </returns>
        public IActionResult AddDouBanGroup([FromBody] JToken model)
        {
            // "?." is also applied to the indexer result: a body that lacks one of the properties
            // must produce the validation error below rather than a NullReferenceException.
            string doubanGroup = model?["groupId"]?.ToString();
            string cityName    = model?["cityName"]?.ToString();

            if (string.IsNullOrEmpty(doubanGroup) || string.IsNullOrEmpty(cityName))
            {
                return Ok(new { success = false, error = "请输入豆瓣小组Group和城市名称。" });
            }

            // Fetch a single page to check that the group/city combination yields data.
            var topics = DoubanHouseCrawler.GetHouseData(doubanGroup, cityName, 1);

            if (topics == null || !topics.Any())
            {
                return Ok(new
                {
                    success = false,
                    error = "保存失败!请检查豆瓣小组ID(如:XMhouse)/城市名称(如:厦门)是否正确..."
                });
            }

            var cityInfo = $"{{ 'groupid':'{doubanGroup}','cityname':'{cityName}','pagecount':5}}";

            // NOTE(review): doubanConfig is freshly constructed and therefore never null, so this
            // branch always returns and the Insert below is never reached. This presumably was
            // meant to look up an existing configuration for the group; confirm the intended
            // lookup before changing it. Behaviour is left as-is here.
            var doubanConfig = new CrawlerConfiguration();
            if (doubanConfig != null)
            {
                return Ok(new { success = true });
            }

            var config = new CrawlerConfiguration()
            {
                ConfigurationKey   = 0,
                ConfigurationValue = cityInfo,
                ConfigurationName  = ConstConfigName.Douban,
                DataCreateTime     = DateTime.Now,
                IsEnabled          = true,
            };
            configurationDapper.Insert(config);
            return Ok(new { success = true });
        }
예제 #11
0
        /// <summary>
        /// Creates the item consumer named by <c>config.ItemConsumer</c>.
        /// </summary>
        /// <param name="config">Crawler configuration; must not be null.</param>
        /// <returns>
        /// An <see cref="ArribaClientIndexerItemConsumer"/> for a null, empty or "arribaclient" name
        /// (using "http://localhost:42784" when <c>ArribaServiceUrl</c> is null), an
        /// <see cref="ArribaDirectIndexerItemConsumer"/> for "arribadirect", or a
        /// <see cref="CsvWriterItemConsumer"/> for "csvwriter". Names are compared case-insensitively.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The consumer name is not recognised.</exception>
        public static IItemConsumer Build(CrawlerConfiguration config)
        {
            if (config == null)
            {
                throw new ArgumentNullException("config", "config is null.");
            }

            // An unset (null) consumer name is treated like the empty string, i.e. the default
            // consumer, instead of failing with a NullReferenceException.
            switch ((config.ItemConsumer ?? String.Empty).ToLowerInvariant())
            {
            case "":
            case "arribaclient":
                return new ArribaClientIndexerItemConsumer(config, config.ArribaServiceUrl ?? "http://localhost:42784");

            case "arribadirect":
                return new ArribaDirectIndexerItemConsumer(config);

            case "csvwriter":
                return new CsvWriterItemConsumer(config.ArribaTable, "Changed Date");

            default:
                throw new InvalidOperationException(String.Format("{0} is an unknown Item Consumer", config.ItemConsumer));
            }
        }
예제 #12
0
 /// <summary>
 /// Placeholder constructor: the body is empty, so <paramref name="configuration"/> is currently ignored.
 /// </summary>
 /// <param name="configuration">Crawler configuration; not stored or used by this constructor yet.</param>
 public Crawler(CrawlerConfiguration configuration)
 {
     // TODO: Implement this and delete the other .ctor
 }
예제 #13
0
 /// <summary>
 /// Runs the parameterless constructor, then stores the supplied configuration.
 /// </summary>
 /// <param name="configuration">Configuration assigned to the <c>_configuration</c> field.</param>
 public Crawler(CrawlerConfiguration configuration)
     : this()
 {
     this._configuration = configuration;
 }
예제 #14
0
 /// <summary>
 /// Adds a <see cref="RobotsPipelineStep"/> to the given crawler configuration.
 /// </summary>
 /// <param name="crawlerConfiguration">Configuration that receives the step; its <c>Logger</c> is passed to the step.</param>
 /// <param name="searchPath">Passed through unchanged to the step's constructor; defaults to <c>null</c>.</param>
 /// <returns>The same <paramref name="crawlerConfiguration"/> instance that was passed in.</returns>
 public static CrawlerConfiguration Robots(this CrawlerConfiguration crawlerConfiguration, string searchPath = null)
 {
     var robotsStep = new RobotsPipelineStep(searchPath, crawlerConfiguration.Logger);

     crawlerConfiguration.AddPipelineStep(robotsStep);
     return crawlerConfiguration;
 }
예제 #15
0
 /// <summary>
 /// Runs the parameterless constructor, then stores the supplied configuration.
 /// </summary>
 /// <param name="configuration">Configuration assigned to the <c>_configuration</c> field.</param>
 public Crawler(CrawlerConfiguration configuration) : this()
 {
     this._configuration = configuration;
 }