/// <summary>
/// Demo entry point: streams posts from the example bucket's JSON object via
/// the configured data acquirer and prints each post's text to the console.
/// </summary>
public async Task DoAsync()
{
    var bucketName = "example-datasets";
    // FIX: the object/mapping names were reassigned several times in a row;
    // only the final pair was ever used, so the dead assignments are removed.
    var objectName = "tweets.apple.json";
    var mappingName = "tweets.json.mapping";

    var attributesDict = new Dictionary<string, string>
    {
        { "bucketName", bucketName },
        { "objectName", objectName },
        { "mappingName", mappingName }
    };
    var attributes = new DataAcquirerAttributes(attributesDict);

    var guid = Guid.NewGuid();
    var daInput = new DataAcquirerInputModel(
        guid,
        null,
        null,
        attributes,
        0);

    var posts = _dataAcquirer.GetPostsAsync(daInput, CancellationToken.None);
    await foreach (var item in posts)
    {
        Console.WriteLine(item.Text);
    }
}
/// <summary>
/// Streams posts for every subquery of the input model's query, merging the
/// per-subquery batch enumerators into one asynchronous sequence.
/// </summary>
public async IAsyncEnumerable<DataAcquirerPost> GetPostsAsync(
    DataAcquirerInputModel acquirerInputModel,
    [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
    var credentials = ExtractCredentials(acquirerInputModel);
    var twitterContext = await _twitterContextProvider.GetContextAsync(credentials);

    var subqueries = ParseTwitterQuery(acquirerInputModel.Query);

    // One batch enumerator per subquery, each with its own loader and default metadata.
    var asyncEnumerators = subqueries.Select(subquery =>
    {
        var loader = _twitterBatchLoaderFactory.Create(acquirerInputModel.JobId);
        var metadata = CreateDefaultMetadata(subquery, acquirerInputModel);
        return loader.CreateBatchPostEnumerator(twitterContext, metadata);
    });

    var merged = AsyncEnumeratorConfluctor.AggregateEnumerables(
        asyncEnumerators, cancellationToken);

    await foreach (var post in merged)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            yield break;
        }

        yield return post;
    }
}
/// <summary>
/// Builds Reddit credentials from the input model's attribute bag.
/// </summary>
private RedditCredentials ExtractCredentials(DataAcquirerInputModel acquirerInputModel)
{
    var attributes = acquirerInputModel.Attributes;

    return new RedditCredentials(
        attributes.GetValue("appId"),
        attributes.GetValue("appSecret"),
        attributes.GetValue("refreshToken"));
}
/// <summary>
/// Validates the job config, persists it, builds the acquirer input model and
/// processes the resulting post stream. Errors stop the job and are logged;
/// cancellation terminates silently.
/// </summary>
private async Task RunJobAsync(DataAcquirerJobConfig jobConfig, CancellationToken cancellationToken)
{
    try
    {
        // FIX: the value comparison was commented out, so translation was enabled
        // whenever the "Translate" key merely existed, regardless of its value.
        // Translation now requires the attribute to be present AND equal "true"
        // (case-insensitive, as the commented-out ToLower() comparison intended).
        var translate = jobConfig.Attributes.TryGetValue("Translate", out string value)
            && string.Equals(value, "true", StringComparison.OrdinalIgnoreCase);

        // TODO validate job config
        if (!jobConfig.Attributes.ContainsKey("TopicQuery"))
        {
            _logger.TrackError(
                "StartNewJob",
                "TopicQuery attribute is not present. Job did not start",
                new
                {
                    jobId = jobConfig.JobId
                });
            return;
        }

        // Language is optional; null means "no language filter".
        string queryLanguage = null;
        if (jobConfig.Attributes.TryGetValue("Language", out var desiredLanguage))
        {
            queryLanguage = desiredLanguage;
        }

        await _dataAcquirerJobStorage.SaveAsync(jobConfig.JobId, jobConfig);

        var batchSize = 100;
        var dataAcquirerInputModel = DataAcquirerInputModel.FromValues(
            jobConfig.JobId,
            jobConfig.Attributes["TopicQuery"],
            queryLanguage,
            new DataAcquirerAttributes(jobConfig.Attributes),
            batchSize);

        var batch = _acquirer.GetPostsAsync(
            dataAcquirerInputModel,
            cancellationToken);

        _logger.TrackInfo("MessageTracking", "Starting");
        await ProcessBatch(jobConfig, dataAcquirerInputModel, batch, translate);
    }
    catch (TaskCanceledException)
    {
        // Cancellation is the expected way to stop a job — not an error.
    }
    catch (Exception e)
    {
        _runningJobsRecords.Remove(jobConfig.JobId);
        _logger.TrackError(
            "RunJob",
            "Job encountered an error and stopped.",
            new
            {
                jobId = jobConfig.JobId,
                exception = e
            });
    }
}
/// <summary>
/// Builds Twitter credentials from the input model's attribute bag.
/// </summary>
private static TwitterCredentials ExtractCredentials(DataAcquirerInputModel acquirerInputModel)
{
    var attributes = acquirerInputModel.Attributes;

    // TODO validate credentials
    return new TwitterCredentials
    {
        ConsumerKey = attributes.GetValue("ApiKey"),
        ConsumerSecret = attributes.GetValue("ApiSecretKey"),
        AccessToken = attributes.GetValue("AccessToken"),
        AccessTokenSecret = attributes.GetValue("AccessTokenSecret")
    };
}
/// <summary>
/// Creates the starting metadata for a query: full id range
/// (SinceId 0 .. MaxId ulong.MaxValue) with the model's language and batch size.
/// </summary>
private static TwitterMetadata CreateDefaultMetadata(
    string query, DataAcquirerInputModel acquirerInputModel)
{
    var metadata = new TwitterMetadata
    {
        MaxId = ulong.MaxValue,
        SinceId = 0,
        Language = acquirerInputModel.QueryLanguage,
        Query = query,
        BatchSize = acquirerInputModel.BatchSize
    };

    return metadata;
}
/// <summary>
/// Simulated acquirer: after a configured delay, yields 100 randomly
/// generated posts.
/// </summary>
public async IAsyncEnumerable<DataAcquirerPost> GetPostsAsync(
    DataAcquirerInputModel jobConfig,
    [EnumeratorCancellation] CancellationToken cancellationToken)
{
    // FIX: the delay previously ignored the cancellation token, so a caller
    // could not abort during the simulated download wait.
    await Task.Delay(_downloadDelay, cancellationToken);

    var uniPosts = Enumerable
        .Range(0, 100)
        .Select(r => _random.Next())
        .Select(GetRandomPost);

    foreach (var post in uniPosts)
    {
        // FIX: stop yielding promptly when the consumer cancels.
        if (cancellationToken.IsCancellationRequested)
        {
            yield break;
        }
        yield return post;
    }
}
/// <summary>
/// Simulated acquirer: repeatedly takes a batch of posts from the backing
/// enumerator, waits the simulated download delay, and yields the batch.
/// Runs until the caller cancels.
/// </summary>
public async IAsyncEnumerable<DataAcquirerPost> GetPostsAsync(
    DataAcquirerInputModel acquirerInputModel,
    [EnumeratorCancellation] CancellationToken cancellationToken)
{
    // FIX: the loop was infinite and un-cancellable; it now exits on cancellation.
    // (An unused running-id local was also removed.)
    while (!cancellationToken.IsCancellationRequested)
    {
        var count = acquirerInputModel.BatchSize;
        var posts = _postsEnumerator
            .Take(count)
            .Select(post => DataAcquirerPost.FromValues(
                post.OriginalPostId,
                post.Text,
                post.Language,
                post.Source,
                post.UserId,
                post.PostDateTime))
            .ToList();

        // FIX: the delay previously used CancellationToken.None, which made the
        // catch block dead code and the delay impossible to interrupt.
        try
        {
            await Task.Delay(_downloadSimulatedDelay, cancellationToken);
        }
        catch (TaskCanceledException)
        {
            yield break;
        }

        foreach (var post in posts)
        {
            yield return post;
        }
    }
}
/// <summary>
/// Console demo: resolves the Reddit acquirer from DI, runs a fixed search
/// query and prints each returned post as indented JSON.
/// </summary>
static async Task MainAsync(string[] args)
{
    var services = Configure();
    var credentials = services
        .GetRequiredService<IOptions<RedditCredentialsOptions>>()
        .Value;
    var redditAcquirer = services.GetRequiredService<IDataAcquirer>();

    // Credentials travel to the acquirer through the generic attribute bag.
    var attributesDict = new Dictionary<string, string>
    {
        { "appId", credentials.AppId },
        { "appSecret", credentials.AppSecret },
        { "refreshToken", credentials.RefreshToken }
    };
    var attributes = new DataAcquirerAttributes(attributesDict);

    var query = "snake bites NOT piercing NOT darts NOT music";
    var inputModel = new DataAcquirerInputModel(
        Guid.NewGuid(),
        query,
        null,
        attributes,
        3);

    await foreach (var item in redditAcquirer.GetPostsAsync(inputModel))
    {
        Console.WriteLine(JsonConvert.SerializeObject(item, Formatting.Indented));
    }

    Console.WriteLine("Search ended");
    Console.ReadLine();
}
/// <summary>
/// Drains an acquired-post stream: optionally translates non-English posts to
/// English, wraps each post in a UniPostModel and sends it, serialized as JSON,
/// to the job's output message-broker channels.
/// </summary>
private async Task ProcessBatch(
    DataAcquirerJobConfig jobConfig,
    DataAcquirerInputModel dataAcquirerInputModel,
    IAsyncEnumerable<DataAcquirerPost> batch,
    bool translate)
{
    var processed = 0;
    await foreach (var dataPost in batch)
    {
        // Progress is logged with the count BEFORE this post is included.
        LogProgress(jobConfig, processed);
        processed++;

        var postId = CalculatePostId(jobConfig, dataPost);
        var text = ClearText(dataPost.Text);
        string originalText = null;

        var needsTranslation = translate
            && dataPost.Language != null
            && dataPost.Language != "en";
        if (needsTranslation)
        {
            try
            {
                var translated = await _translationService
                    .TranslateToEnglishAsync(dataPost.Language, text);
                originalText = text;
                text = translated;
            }
            catch (DataAcquirerException ex)
            {
                // Translation failure is non-fatal: keep the untranslated text.
                _logger.TrackWarning(
                    "TranslationError",
                    "Could not translate",
                    new
                    {
                        jobId = dataAcquirerInputModel.JobId,
                        exception = ex,
                        text
                    });
            }
        }

        var uniPost = UniPostModel.FromValues(
            postId,
            dataPost.OriginalPostId,
            text,
            originalText,
            dataPost.Language,
            dataPost.Source,
            dataPost.UserId,
            dataPost.DateTime,
            dataAcquirerInputModel.JobId,
            dataPost.Query);

        var messageBrokerMessage = new MessageBrokerMessage(
            "acquired-data-post",
            JsonConvert.SerializeObject(uniPost));

        await SendRecordToOutputs(
            jobConfig.OutputMessageBrokerChannels,
            messageBrokerMessage);
    }
}
/// <summary>
/// Polls Reddit for posts matching the query. Each sweep pages through the
/// listing until it reaches items already seen in a previous sweep (tracked by
/// the "before" watermark), yielding self-texts and top comments, then sleeps
/// ten minutes before polling again. Runs until cancelled.
/// </summary>
public async IAsyncEnumerable<DataAcquirerPost> GetPostsAsync(
    DataAcquirerInputModel acquirerInputModel,
    [EnumeratorCancellation] CancellationToken cancellationToken)
{
    var credentials = ExtractCredentials(acquirerInputModel);
    var reddit = await _redditContextProvider.GetContextAsync(credentials);
    var query = acquirerInputModel.Query;
    var limit = 50;
    DateTime? before = null;

    // FIX: the polling loop previously never consulted the cancellation token.
    while (!cancellationToken.IsCancellationRequested)
    {
        var maxBefore = before;
        var count = 0;
        string after = null;
        var postListing = GetPosts(reddit, after, limit, query, count);
        var outDated = false;

        while (postListing.Count > 0)
        {
            var children = postListing;
            foreach (var item in children)
            {
                // Reached posts already processed in an earlier sweep — stop paging.
                if (item.Created <= before)
                {
                    outDated = true;
                    break;
                }
                count++;
                maxBefore = Max(item.Created, maxBefore);

                // Only posts with a self-text body are emitted directly.
                if (!string.IsNullOrWhiteSpace(item.Listing.SelfText))
                {
                    yield return FromPost(item, query);
                }

                // Also emit up to 100 top comments; comments are assumed English.
                var comments = item.Comments.GetTop(100);
                foreach (var c in comments)
                {
                    if (string.IsNullOrWhiteSpace(c.Body))
                    {
                        continue;
                    }
                    var listingPost = item.Listing;
                    yield return DataAcquirerPost.FromValues(
                        listingPost.Id,
                        c.Body,
                        "en",
                        "reddit",
                        c.Author ?? "n/a",
                        listingPost.CreatedUTC.ToString("s"),
                        query);
                }
            }

            if (outDated)
            {
                break;
            }
            after = postListing.Count > 0 ? postListing.Last().Fullname : after;
            postListing = GetPosts(reddit, after, limit, query, count);
        }

        before = maxBefore;

        // FIX: the 10-minute wait previously ignored the cancellation token,
        // so a cancelled job could linger for up to ten minutes.
        try
        {
            await Task.Delay(TimeSpan.FromMinutes(10), cancellationToken);
        }
        catch (TaskCanceledException)
        {
            yield break;
        }
    }
}
/// <summary>
/// Streams posts parsed from a custom data object in MinIO. The object's
/// format is described by a mapping object in the same bucket; a background
/// task feeds the object's stream into the reader while this method drains
/// and validates parsed posts.
/// </summary>
public async IAsyncEnumerable<DataAcquirerPost> GetPostsAsync(
    DataAcquirerInputModel acquirerInputModel,
    [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
    var minio = new MinioClient(
        _endpoint,
        _accessKey,
        _secret);

    var attributes = acquirerInputModel.Attributes;
    var bucketName = attributes.GetValue(_bucketElementName, null);
    var objectName = attributes.GetValue(_objectElementName, null);
    var mappingName = attributes.GetValue(_mappingElementName, null);

    var atts = await GetMappingAttributesAsync(minio, bucketName, mappingName);
    if (atts == null
        || string.IsNullOrEmpty(atts.DataFormat)
        || atts.MappingAttributes == null)
    {
        throw new InvalidOperationException("Invalid config");
    }

    try
    {
        await minio.StatObjectAsync(bucketName, objectName);
    }
    catch (MinioException)
    {
        // FIX: the failed stat is on the DATA object, but the original message
        // blamed the mapping object ("Mapping object ... does not exits").
        throw new InvalidOperationException(
            $"Object '{bucketName}-{objectName}' does not exist");
    }

    var reader = _customStreamReaderFactory.Create(atts);
    using (var cts = new CancellationTokenSource())
    {
        // Background task pumps the object's stream into the reader.
        var listeningTask = Task.Run(async () =>
        {
            await minio.GetObjectAsync(bucketName, objectName, reader.StartPopulating)
                .ConfigureAwait(false);
        });

        while (!reader.IsCompleted)
        {
            // FIX: honor the caller's cancellation token instead of looping forever.
            if (cancellationToken.IsCancellationRequested)
            {
                yield break;
            }

            if (reader.TryGetPost(out var post))
            {
                var validation = PostValidator.ValidatePost(post);
                if (validation.IsSuccessful)
                {
                    yield return post;
                }
                else
                {
                    _logger.TrackWarning(
                        "CustomStaticData",
                        "Invalid post encountered",
                        new
                        {
                            errorMessage = validation.ErrorMessage,
                            post
                        });
                }
            }
            else
            {
                // Nothing parsed yet — back off before polling the reader again.
                // FIX: the back-off delay is now cancellable too.
                await Task.Delay(TimeSpan.FromSeconds(1), cancellationToken);
            }
        }

        cts.Cancel();
        try
        {
            await listeningTask;
        }
        catch (TaskCanceledException) { }
    }
}
// Verifies that TwitterDataAcquirer, driven by a fake twitter context that
// serves tweet ids up to maxTweetId, eventually yields every id 1..maxTweetId
// exactly once. Mock setup order matters: loader factory depends on the
// metadata provider mock, acquirer depends on loader factory + context mock.
public async Task TestStorageAsync()
{
    // Fake context: `limit` caps what a single request may return,
    // `maxTweetId` is the highest tweet id the fake will ever serve.
    var limit = (ulong)301;
    var maxTweetId = (ulong)300;
    var testTwitterContext = new TestTwitterContext(limit, maxTweetId);

    // Acquisition resumes from MaxId = `starting` (see metadata below).
    var starting = (ulong)100;
    var batchSize = 3;
    var query = "foo";
    string language = null;
    var jobId = Guid.Parse("d2474631-3d4c-4b07-a992-9d9c1f269cd4");

    // Persisted metadata the mocked metadata context hands back on GetOrCreate.
    var metadata = new TwitterMetadata
    {
        BatchSize = 3,
        MaxId = starting,
        SinceId = 0,
        Query = "foo",
        Language = null
    };
    var metadataContextMock = new Mock<IDataAcquirerMetadataContext>();
    metadataContextMock
        .Setup(r => r.GetOrCreateAsync(It.IsAny<TwitterMetadata>()))
        .Returns(Task.FromResult(metadata));

    var metaDataContextProviderMock = new Mock<IDataAcquirerMetadataContextProvider>();
    metaDataContextProviderMock.Setup(r => r.Get(It.IsAny<Guid>()))
        .Returns(metadataContextMock.Object);

    // Zero delays so the loader's retry/backoff paths do not slow the test down.
    var batchLoaderOptions = Options.Create(new TwitterBatchLoaderOptions
    {
        NoPostWaitDelay = TimeSpan.Zero,
        ErrorEncounteredWaitDelay = TimeSpan.Zero,
        RateLimitExceededWaitDelay = TimeSpan.Zero
    });
    var batchLoaderFactory = new TwitterBatchLoaderFactory(
        batchLoaderOptions,
        new Mock<IEventTracker<TwitterBatchLoader>>().Object,
        metaDataContextProviderMock.Object);

    // Context provider always returns the fake twitter context regardless of credentials.
    var contextProviderMock = new Mock<ITwitterContextProvider>();
    contextProviderMock
        .Setup(r => r.GetContextAsync(It.IsAny<TwitterCredentials>()))
        .Returns(Task.FromResult<ITwitterContext>(testTwitterContext));

    var trackerMocq = new Mock<IEventTracker<TwitterDataAcquirer>>();

    var twitterDataAcquirer = new TwitterDataAcquirer(
        batchLoaderFactory,
        contextProviderMock.Object,
        trackerMocq.Object);

    var input = new DataAcquirerInputModel(
        jobId,
        query,
        language,
        new DataAcquirerAttributes(new Dictionary<string, string>()),
        batchSize);

    // Drain the (endless) stream until maxTweetId posts have been collected.
    var posts = twitterDataAcquirer.GetPostsAsync(input);
    var ids = new List<ulong>();
    await foreach (var post in posts)
    {
        if ((ulong)ids.Count >= maxTweetId)
        {
            break;
        }
        var idStr = post.OriginalPostId;
        var id = ulong.Parse(idStr);
        ids.Add(id);
    }

    // Order-independent comparison: sort what we got, expect exactly 1..maxTweetId.
    var actual = ids.OrderBy(r => r).ToList();
    var expected = Enumerable.Range(1, (int)maxTweetId).Select(r => (ulong)r).ToList();
    CollectionAssert.AreEqual(
        expected,
        actual);
}