コード例 #1
0
        /// <summary>
        /// Manual smoke run: builds an input model that points at a sample object in the
        /// "example-datasets" bucket, requests its posts from <c>_dataAcquirer</c> and writes
        /// the text of every returned post to the console.
        /// </summary>
        /// <returns>A task that completes when the post stream has been fully consumed.</returns>
        public async Task DoAsync()
        {
            // Only the last assigned values were ever used, so they are stated directly.
            // Sample objects that were previously assigned (and immediately overwritten) here:
            // "tweets_tiny_no_headers.csv", "tweets_tiny.csv", "tweets.csv"
            // (the latter together with the mapping "tweets.csv.mapping").
            const string bucketName  = "example-datasets";
            const string objectName  = "tweets.apple.json";
            const string mappingName = "tweets.json.mapping";

            var attributesDict = new Dictionary <string, string> {
                { "bucketName", bucketName },
                { "objectName", objectName },
                { "mappingName", mappingName }
            };
            var attributes = new DataAcquirerAttributes(attributesDict);
            var guid       = Guid.NewGuid();

            // Query and query language are left null; the last argument (batch size) is 0.
            var daInput = new DataAcquirerInputModel(
                guid,
                null,
                null,
                attributes,
                0
                );
            var posts = _dataAcquirer.GetPostsAsync(daInput, CancellationToken.None);

            await foreach (var item in posts)
            {
                Console.WriteLine(item.Text);
            }
        }
コード例 #2
0
        /// <summary>
        /// Streams posts for the given input model. The query is split into subqueries by
        /// <c>ParseTwitterQuery</c>; each subquery gets its own batch loader and enumerator,
        /// and all of them are combined through
        /// <c>AsyncEnumeratorConfluctor.AggregateEnumerables</c>.
        /// </summary>
        /// <param name="acquirerInputModel">Job id, query and attributes (credentials) of the request.</param>
        /// <param name="cancellationToken">When cancellation is requested the stream ends without throwing.</param>
        /// <returns>An asynchronous sequence of acquired posts.</returns>
        public async IAsyncEnumerable <DataAcquirerPost> GetPostsAsync(
            DataAcquirerInputModel acquirerInputModel,
            [EnumeratorCancellation] CancellationToken cancellationToken = default)
        {
            var context = await _twitterContextProvider.GetContextAsync(
                ExtractCredentials(acquirerInputModel));

            // Deferred projection: a loader and its enumerator are created only when the
            // aggregator actually pulls the corresponding element.
            var perQueryStreams = ParseTwitterQuery(acquirerInputModel.Query)
                                  .Select(subquery => _twitterBatchLoaderFactory
                                          .Create(acquirerInputModel.JobId)
                                          .CreateBatchPostEnumerator(
                                              context,
                                              CreateDefaultMetadata(subquery, acquirerInputModel)));

            var merged = AsyncEnumeratorConfluctor.AggregateEnumerables(
                perQueryStreams,
                cancellationToken);

            await foreach (var post in merged)
            {
                // Stop quietly instead of forwarding further posts once cancelled.
                if (cancellationToken.IsCancellationRequested)
                {
                    yield break;
                }

                yield return post;
            }
        }
コード例 #3
0
 /// <summary>
 /// Builds Reddit credentials from the "appId", "appSecret" and "refreshToken"
 /// attributes of the input model.
 /// </summary>
 /// <param name="acquirerInputModel">Input model whose attributes carry the credential values.</param>
 /// <returns>A new <c>RedditCredentials</c> instance.</returns>
 private RedditCredentials ExtractCredentials(DataAcquirerInputModel acquirerInputModel)
 {
     var attributes   = acquirerInputModel.Attributes;
     var appId        = attributes.GetValue("appId");
     var appSecret    = attributes.GetValue("appSecret");
     var refreshToken = attributes.GetValue("refreshToken");

     return new RedditCredentials(appId, appSecret, refreshToken);
 }
コード例 #4
0
        /// <summary>
        /// Runs a single acquisition job: validates that a "TopicQuery" attribute is present,
        /// stores the job configuration, requests the post stream from <c>_acquirer</c> and
        /// hands it to <c>ProcessBatch</c>.
        /// </summary>
        /// <param name="jobConfig">Configuration of the job, including its attribute dictionary.</param>
        /// <param name="cancellationToken">Token forwarded to the acquirer.</param>
        /// <returns>A task that completes when the job has finished, failed or been cancelled.</returns>
        /// <remarks>
        /// Cancellation is swallowed silently. Any other exception removes the job from
        /// <c>_runningJobsRecords</c> and is logged; it is not rethrown.
        /// </remarks>
        private async Task RunJobAsync(DataAcquirerJobConfig jobConfig,
                                       CancellationToken cancellationToken)
        {
            try
            {
                // NOTE(review): translation is switched on by the mere presence of the
                // "Translate" key; its value is not inspected. Confirm this is intended.
                var translate = jobConfig.Attributes.TryGetValue("Translate", out string _);

                // TODO validate job config
                // Single lookup instead of ContainsKey followed by the indexer.
                if (!jobConfig.Attributes.TryGetValue("TopicQuery", out var topicQuery))
                {
                    _logger.TrackError(
                        "StartNewJob",
                        "TopicQuery attribute is not present. Job did not start",
                        new { jobId = jobConfig.JobId });
                    return;
                }

                string queryLanguage = null;
                if (jobConfig.Attributes.TryGetValue("Language", out var desiredLanguage))
                {
                    queryLanguage = desiredLanguage;
                }

                await _dataAcquirerJobStorage.SaveAsync(jobConfig.JobId, jobConfig);

                var batchSize = 100;

                var dataAcquirerInputModel = DataAcquirerInputModel.FromValues(
                    jobConfig.JobId,
                    topicQuery,
                    queryLanguage,
                    new DataAcquirerAttributes(jobConfig.Attributes),
                    batchSize
                    );

                var batch = _acquirer.GetPostsAsync(
                    dataAcquirerInputModel,
                    cancellationToken);

                _logger.TrackInfo("MessageTracking", "Starting");

                await ProcessBatch(jobConfig, dataAcquirerInputModel, batch, translate);
            }
            // TaskCanceledException is always treated as cancellation (as before). A plain
            // OperationCanceledException is treated the same way only when cancellation was
            // actually requested, so it is no longer reported as a job error.
            catch (OperationCanceledException e)
                when (e is TaskCanceledException || cancellationToken.IsCancellationRequested)
            {
            }
            catch (Exception e)
            {
                _runningJobsRecords.Remove(jobConfig.JobId);
                _logger.TrackError(
                    "RunJob",
                    "Job encountered an error and stopped.",
                    new
                {
                    jobId     = jobConfig.JobId,
                    exception = e
                });
            }
        }
コード例 #5
0
 /// <summary>
 /// Builds Twitter credentials from the "ApiKey", "ApiSecretKey", "AccessToken" and
 /// "AccessTokenSecret" attributes of the input model.
 /// </summary>
 /// <param name="acquirerInputModel">Input model whose attributes carry the credential values.</param>
 /// <returns>A new <c>TwitterCredentials</c> instance.</returns>
 private static TwitterCredentials ExtractCredentials(DataAcquirerInputModel acquirerInputModel)
 {
     // TODO validate credentials
     var attributes        = acquirerInputModel.Attributes;
     var consumerKey       = attributes.GetValue("ApiKey");
     var consumerSecret    = attributes.GetValue("ApiSecretKey");
     var accessToken       = attributes.GetValue("AccessToken");
     var accessTokenSecret = attributes.GetValue("AccessTokenSecret");

     return new TwitterCredentials
     {
         ConsumerKey       = consumerKey,
         ConsumerSecret    = consumerSecret,
         AccessToken       = accessToken,
         AccessTokenSecret = accessTokenSecret
     };
 }
コード例 #6
0
 /// <summary>
 /// Creates the initial metadata for one subquery: the id window spans from 0 to
 /// <c>ulong.MaxValue</c>, while language and batch size are copied from the input model.
 /// </summary>
 /// <param name="query">The subquery this metadata belongs to.</param>
 /// <param name="acquirerInputModel">Source of the query language and batch size.</param>
 /// <returns>A new <c>TwitterMetadata</c> instance.</returns>
 private static TwitterMetadata CreateDefaultMetadata(
     string query,
     DataAcquirerInputModel acquirerInputModel) =>
     new TwitterMetadata
     {
         MaxId     = ulong.MaxValue,
         SinceId   = 0,
         Language  = acquirerInputModel.QueryLanguage,
         Query     = query,
         BatchSize = acquirerInputModel.BatchSize
     };
コード例 #7
0
        /// <summary>
        /// Waits for <c>_downloadDelay</c> and then yields 100 posts, each produced by
        /// <c>GetRandomPost</c> from a value drawn from <c>_random</c>.
        /// </summary>
        /// <param name="jobConfig">Input model of the job; not read by this implementation.</param>
        /// <param name="cancellationToken">
        /// Aborts the initial delay and stops the stream; the sequence then ends without throwing.
        /// </param>
        /// <returns>An asynchronous sequence of generated posts.</returns>
        public async IAsyncEnumerable <DataAcquirerPost> GetPostsAsync(
            DataAcquirerInputModel jobConfig,
            [EnumeratorCancellation] CancellationToken cancellationToken)
        {
            try
            {
                // The token was previously ignored, so the delay could not be interrupted.
                await Task.Delay(_downloadDelay, cancellationToken);
            }
            catch (OperationCanceledException)
            {
                // Handled by the cancellation check right below.
            }

            if (cancellationToken.IsCancellationRequested)
            {
                yield break;
            }

            var uniPosts = Enumerable
                           .Range(0, 100)
                           .Select(r => _random.Next())
                           .Select(GetRandomPost);

            foreach (var post in uniPosts)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    yield break;
                }

                yield return post;
            }
        }
コード例 #8
0
        /// <summary>
        /// Repeatedly takes <c>BatchSize</c> posts from <c>_postsEnumerator</c>, converts them
        /// to <c>DataAcquirerPost</c>, waits for <c>_downloadSimulatedDelay</c> and yields them.
        /// </summary>
        /// <param name="acquirerInputModel">Supplies the batch size used for every round.</param>
        /// <param name="cancellationToken">
        /// Ends the otherwise endless stream; the sequence then completes without throwing.
        /// </param>
        /// <returns>An asynchronous sequence of posts that runs until cancellation.</returns>
        public async IAsyncEnumerable <DataAcquirerPost> GetPostsAsync(
            DataAcquirerInputModel acquirerInputModel,
            [EnumeratorCancellation] CancellationToken cancellationToken)
        {
            // Previously `while (true)` with the delay bound to CancellationToken.None:
            // the token was never observed and the catch block below could never run.
            while (!cancellationToken.IsCancellationRequested)
            {
                var count = acquirerInputModel.BatchSize;

                var posts = _postsEnumerator
                            .Take(count)
                            .Select(post =>
                                    DataAcquirerPost.FromValues(
                                        post.OriginalPostId,
                                        post.Text,
                                        post.Language,
                                        post.Source,
                                        post.UserId,
                                        post.PostDateTime))
                            .ToList();

                try
                {
                    await Task.Delay(_downloadSimulatedDelay, cancellationToken);
                }
                catch (TaskCanceledException)
                {
                    // Handled by the cancellation check right below.
                }

                if (cancellationToken.IsCancellationRequested)
                {
                    yield break;
                }

                foreach (var post in posts)
                {
                    yield return post;
                }
            }
        }
コード例 #9
0
        /// <summary>
        /// Console entry point for a manual Reddit search: resolves the credentials and the
        /// acquirer from the configured services, runs a fixed query and prints every
        /// returned post as indented JSON.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <returns>A task that completes after the user presses Enter.</returns>
        static async Task MainAsync(string[] args)
        {
            var provider = Configure();

            var redditCredentials = provider
                                    .GetRequiredService <IOptions <RedditCredentialsOptions> >()
                                    .Value;
            var acquirer = provider.GetRequiredService <IDataAcquirer>();

            // Credential attributes under the keys the Reddit acquirer reads.
            var credentialAttributes = new Dictionary <string, string>
            {
                ["appId"]        = redditCredentials.AppId,
                ["appSecret"]    = redditCredentials.AppSecret,
                ["refreshToken"] = redditCredentials.RefreshToken
            };

            var request = new DataAcquirerInputModel(
                Guid.NewGuid(),
                "snake bites NOT piercing NOT darts NOT music",
                null,
                new DataAcquirerAttributes(credentialAttributes),
                3);

            await foreach (var post in acquirer.GetPostsAsync(request))
            {
                var json = JsonConvert.SerializeObject(post, Formatting.Indented);
                Console.WriteLine(json);
            }

            Console.WriteLine("Search ended");
            Console.ReadLine();
        }
コード例 #10
0
        /// <summary>
        /// Consumes the post stream of a job: cleans each post's text, optionally translates
        /// it to English, wraps it in a <c>UniPostModel</c> and sends the serialized model to
        /// the job's output channels.
        /// </summary>
        /// <param name="jobConfig">Job configuration; supplies the output channels.</param>
        /// <param name="dataAcquirerInputModel">Input model; supplies the job id.</param>
        /// <param name="batch">Asynchronous stream of acquired posts.</param>
        /// <param name="translate">Whether posts whose language is neither "en" nor null are translated.</param>
        /// <returns>A task that completes when the stream is exhausted.</returns>
        private async Task ProcessBatch(
            DataAcquirerJobConfig jobConfig,
            DataAcquirerInputModel dataAcquirerInputModel,
            IAsyncEnumerable <DataAcquirerPost> batch,
            bool translate)
        {
            var processed = 0;

            await foreach (var acquiredPost in batch)
            {
                // Progress is reported with the number of posts handled before this one.
                LogProgress(jobConfig, processed);
                processed += 1;

                var postId = CalculatePostId(jobConfig, acquiredPost);

                var    outgoingText   = ClearText(acquiredPost.Text);
                string untranslated   = null;
                var    shouldTranslate = translate
                                         && acquiredPost.Language != "en"
                                         && acquiredPost.Language != null;

                if (shouldTranslate)
                {
                    try
                    {
                        var english = await _translationService
                                      .TranslateToEnglishAsync(acquiredPost.Language, outgoingText);

                        // Keep the cleaned source text next to its translation.
                        untranslated = outgoingText;
                        outgoingText = english;
                    }
                    catch (DataAcquirerException translationError)
                    {
                        // On failure the post is still forwarded with its cleaned, untranslated text.
                        _logger.TrackWarning("TranslationError", "Could not translate",
                                             new
                        {
                            jobId     = dataAcquirerInputModel.JobId,
                            exception = translationError,
                            text      = outgoingText
                        });
                    }
                }

                var unifiedPost = UniPostModel.FromValues(
                    postId,
                    acquiredPost.OriginalPostId,
                    outgoingText,
                    untranslated,
                    acquiredPost.Language,
                    acquiredPost.Source,
                    acquiredPost.UserId,
                    acquiredPost.DateTime,
                    dataAcquirerInputModel.JobId,
                    acquiredPost.Query);

                var brokerMessage = new MessageBrokerMessage(
                    "acquired-data-post",
                    JsonConvert.SerializeObject(unifiedPost));

                await SendRecordToOutputs(jobConfig.OutputMessageBrokerChannels,
                                          brokerMessage);
            }
        }
コード例 #11
0
        /// <summary>
        /// Polls Reddit for posts matching the query. Each round pages through the listing
        /// returned by <c>GetPosts</c>, yields every post with a non-blank self text plus up
        /// to 100 top comments with a non-blank body, and stops paging at the first post that
        /// is not newer than the newest post of the previous round. Rounds are ten minutes apart.
        /// </summary>
        /// <param name="acquirerInputModel">Supplies the query and the credential attributes.</param>
        /// <param name="cancellationToken">
        /// Ends the otherwise endless polling, including the wait between rounds; the sequence
        /// then completes without throwing.
        /// </param>
        /// <returns>An asynchronous sequence of posts and comments that runs until cancellation.</returns>
        public async IAsyncEnumerable <DataAcquirerPost> GetPostsAsync(
            DataAcquirerInputModel acquirerInputModel,
            [EnumeratorCancellation] CancellationToken cancellationToken)
        {
            var credentials = ExtractCredentials(acquirerInputModel);
            var reddit      = await _redditContextProvider.GetContextAsync(credentials);

            var query = acquirerInputModel.Query;

            const int limit  = 50;
            DateTime? before = null;

            // Previously `while (true)`: the token was never observed.
            while (!cancellationToken.IsCancellationRequested)
            {
                var    maxBefore   = before;
                var    count       = 0;
                string after       = null;
                var    postListing = GetPosts(reddit, after, limit, query, count);
                var    outDated    = false;
                while (postListing.Count > 0)
                {
                    foreach (var item in postListing)
                    {
                        // With `before` still null (first round) this comparison is false,
                        // so every post is accepted.
                        if (item.Created <= before)
                        {
                            outDated = true;
                            break;
                        }
                        count++;
                        maxBefore = Max(item.Created, maxBefore);

                        if (!string.IsNullOrWhiteSpace(item.Listing.SelfText))
                        {
                            yield return FromPost(item, query);
                        }

                        var comments = item.Comments.GetTop(100);
                        foreach (var c in comments)
                        {
                            if (string.IsNullOrWhiteSpace(c.Body))
                            {
                                continue;
                            }
                            // Comments are emitted under the id and creation time of their post.
                            var listingPost = item.Listing;
                            yield return DataAcquirerPost.FromValues(
                                listingPost.Id,
                                c.Body,
                                "en",
                                "reddit",
                                c.Author ?? "n/a",
                                listingPost.CreatedUTC.ToString("s"),
                                query);
                        }
                    }

                    if (outDated)
                    {
                        break;
                    }

                    // Do not request another page once the consumer has cancelled.
                    if (cancellationToken.IsCancellationRequested)
                    {
                        yield break;
                    }

                    // The loop guard guarantees the listing is non-empty here.
                    after = postListing.Last().Fullname;

                    postListing = GetPosts(reddit, after, limit, query, count);
                }
                before = maxBefore;

                try
                {
                    // Cancellable wait; previously cancellation was ignored for the full 10 minutes.
                    await Task.Delay(TimeSpan.FromMinutes(10), cancellationToken);
                }
                catch (OperationCanceledException)
                {
                    // The loop condition ends the stream.
                }
            }
        }
コード例 #12
0
        /// <summary>
        /// Streams posts from an object stored in Minio. The bucket, object and mapping names
        /// are read from the input attributes; the mapping decides which reader is created,
        /// and the object is downloaded in the background while parsed posts are yielded.
        /// Posts that fail <c>PostValidator.ValidatePost</c> are logged and skipped.
        /// </summary>
        /// <param name="acquirerInputModel">Supplies the bucket, object and mapping attributes.</param>
        /// <param name="cancellationToken">
        /// Stops the stream and the background download; the sequence then completes without throwing.
        /// </param>
        /// <returns>An asynchronous sequence of valid posts.</returns>
        /// <exception cref="InvalidOperationException">
        /// The mapping is missing or incomplete, or the data object cannot be found.
        /// </exception>
        public async IAsyncEnumerable <DataAcquirerPost> GetPostsAsync(
            DataAcquirerInputModel acquirerInputModel,
            [EnumeratorCancellation] CancellationToken cancellationToken = default)
        {
            var minio = new MinioClient(
                _endpoint,
                _accessKey,
                _secret);

            var attributes  = acquirerInputModel.Attributes;
            var bucketName  = attributes.GetValue(_bucketElementName, null);
            var objectName  = attributes.GetValue(_objectElementName, null);
            var mappingName = attributes.GetValue(_mappingElementName, null);

            var atts = await GetMappingAttributesAsync(minio, bucketName, mappingName);

            if (atts == null ||
                string.IsNullOrEmpty(atts.DataFormat) ||
                atts.MappingAttributes == null)
            {
                throw new InvalidOperationException("Invalid config");
            }

            try
            {
                await minio.StatObjectAsync(bucketName, objectName);
            }
            catch (MinioException e)
            {
                // This probes the data object, not the mapping: name the right object and
                // keep the original failure as the inner exception.
                throw new InvalidOperationException(
                          $"Object '{objectName}' does not exist in bucket '{bucketName}'",
                          e);
            }

            var reader = _customStreamReaderFactory.Create(atts);

            // Linked to the caller's token so either side can stop the download.
            // Previously the source's token was never handed to anything.
            using (var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken))
            {
                var downloadToken = cts.Token;
                var listeningTask = Task.Run(async() =>
                {
                    await minio.GetObjectAsync(
                        bucketName,
                        objectName,
                        reader.StartPopulating,
                        cancellationToken: downloadToken)
                    .ConfigureAwait(false);
                });

                while (!reader.IsCompleted && !cancellationToken.IsCancellationRequested)
                {
                    if (reader.TryGetPost(out var post))
                    {
                        var validation = PostValidator.ValidatePost(post);
                        if (validation.IsSuccessful)
                        {
                            yield return post;
                        }
                        else
                        {
                            _logger.TrackWarning(
                                "CustomStaticData",
                                "Invalid post encountered",
                                new
                            {
                                errorMessage = validation.ErrorMessage,
                                post
                            });
                        }
                    }
                    else if (listeningTask.IsFaulted)
                    {
                        // Nothing buffered and the download failed: stop polling so the
                        // failure surfaces from the await below instead of spinning forever.
                        break;
                    }
                    else
                    {
                        await Task.Delay(TimeSpan.FromSeconds(1));
                    }
                }

                cts.Cancel();
                try
                {
                    await listeningTask;
                }
                catch (OperationCanceledException)
                {
                    // Expected when the download is stopped through cts.
                }
            }
        }
コード例 #13
0
        /// <summary>
        /// Runs <c>TwitterDataAcquirer</c> against a <c>TestTwitterContext</c> with mocked
        /// metadata storage and asserts that the collected post ids, once sorted, are exactly
        /// 1..300.
        /// </summary>
        /// <returns>A task that completes when the assertion has run.</returns>
        public async Task TestStorageAsync()
        {
            // Scenario parameters.
            const ulong  limit      = 301;
            const ulong  maxTweetId = 300;
            const ulong  starting   = 100;
            const int    batchSize  = 3;
            const string query      = "foo";
            const string language   = null;
            var          jobId      = Guid.Parse("d2474631-3d4c-4b07-a992-9d9c1f269cd4");

            var fakeTwitter = new TestTwitterContext(limit, maxTweetId);

            // Metadata storage always hands back this instance.
            var storedMetadata = new TwitterMetadata
            {
                BatchSize = batchSize,
                MaxId     = starting,
                SinceId   = 0,
                Query     = query,
                Language  = language
            };

            var metadataContext = new Mock <IDataAcquirerMetadataContext>();
            metadataContext
            .Setup(r => r.GetOrCreateAsync(It.IsAny <TwitterMetadata>()))
            .Returns(Task.FromResult(storedMetadata));

            var metadataContextProvider = new Mock <IDataAcquirerMetadataContextProvider>();
            metadataContextProvider
            .Setup(r => r.Get(It.IsAny <Guid>()))
            .Returns(metadataContext.Object);

            // All wait delays are zero so the test does not sleep.
            var loaderOptions = Options.Create(new TwitterBatchLoaderOptions
            {
                NoPostWaitDelay            = TimeSpan.Zero,
                ErrorEncounteredWaitDelay  = TimeSpan.Zero,
                RateLimitExceededWaitDelay = TimeSpan.Zero
            });

            var loaderFactory = new TwitterBatchLoaderFactory(
                loaderOptions,
                new Mock <IEventTracker <TwitterBatchLoader> >().Object,
                metadataContextProvider.Object);

            var twitterContextProvider = new Mock <ITwitterContextProvider>();
            twitterContextProvider
            .Setup(r => r.GetContextAsync(It.IsAny <TwitterCredentials>()))
            .Returns(Task.FromResult <ITwitterContext>(fakeTwitter));

            var acquirerTracker = new Mock <IEventTracker <TwitterDataAcquirer> >();
            var acquirer        = new TwitterDataAcquirer(
                loaderFactory,
                twitterContextProvider.Object,
                acquirerTracker.Object);

            var request = new DataAcquirerInputModel(
                jobId,
                query,
                language,
                new DataAcquirerAttributes(new Dictionary <string, string>()),
                batchSize);

            var collectedIds = new List <ulong>();

            await foreach (var post in acquirer.GetPostsAsync(request))
            {
                // Stop consuming once maxTweetId ids have been gathered.
                if ((ulong)collectedIds.Count >= maxTweetId)
                {
                    break;
                }

                collectedIds.Add(ulong.Parse(post.OriginalPostId));
            }

            var expected = Enumerable.Range(1, (int)maxTweetId).Select(r => (ulong)r).ToList();
            var actual   = collectedIds.OrderBy(r => r).ToList();

            CollectionAssert.AreEqual(
                expected,
                actual);
        }