/// <summary>
/// Sets the <c>tags</c> field on answer documents in the posts index, using the tags read
/// for their parent post id.
/// </summary>
/// <remarks>
/// The (id, tags) pairs are processed in batches of <paramref name="size"/>. Each pair issues its own
/// update-by-query request; all requests of a batch are awaited together before the next batch starts.
/// Answers whose <c>tags</c> field is already set are skipped (the script turns the update into a noop).
/// </remarks>
/// <param name="path">Passed to <c>StackOverflowData.GetPostTagsWithAnswers</c> to obtain the (id, tags) pairs.</param>
/// <param name="size">Number of (id, tags) pairs, and therefore concurrent requests, per batch.</param>
/// <exception cref="Exception">The posts index does not exist.</exception>
/// <exception cref="AggregateException">
/// One or more update-by-query requests of a batch faulted or were cancelled; the exception is flattened.
/// </exception>
public void UpdateAnswersWithQuestionTags(string path, int size)
{
    if (!_client.Indices.Exists(PostsIndex).Exists)
    {
        throw new Exception($"{PostsIndex} index does not exist. You must run the 'posts' command to index posts first");
    }

    var postIdsAndTags = StackOverflowData.GetPostTagsWithAnswers(path);
    long totalAnswersUpdated = 0;
    var totalQuestions = 0;
    var stopWatch = Stopwatch.StartNew();

    foreach (var batch in postIdsAndTags.Batch(size))
    {
        var tasks = batch.Select(b =>
        {
            var (id, tags) = b;
            return _client.UpdateByQueryAsync<Answer>(u => u
                .Routing(id)
                // NOTE(review): the unary + is presumably NEST's operator shorthand for running the
                // query in a filter context - confirm against the NEST query DSL documentation.
                .Query(q => +q
                    .ParentId(p => p
                        .Id(id)
                        .Type<Answer>()
                    )
                )
                .Conflicts(Conflicts.Proceed)
                .Index(PostsIndex)
                .Timeout(TimeSpan.FromMinutes(1))
                .WaitForCompletion()
                .Script(ss => ss
                    .Source(@"if (ctx._source.tags == null) { ctx._source.tags = params.tags; } else { ctx.op = ""noop""; }")
                    .Params(p => p
                        .Add("tags", tags)
                    )
                )
            );
        }).ToArray();

        try
        {
            // Wait() throws by itself when any request faulted or was cancelled, so a status
            // check placed after it can never observe a faulted task; flatten here instead.
            Task.WhenAll(tasks).Wait();
        }
        catch (AggregateException e)
        {
            throw e.Flatten();
        }

        totalQuestions += tasks.Length;
        totalAnswersUpdated += tasks.Sum(t => t.Result.Updated);
        Log.WriteLine($"Updated {totalAnswersUpdated} answers for {totalQuestions} questions");
    }

    Log.WriteLine($"time taken to update answers: {stopWatch.Elapsed}");
}
/// <summary>
/// Bulk indexes users into the users index, then applies badges to those users with scripted bulk updates.
/// </summary>
/// <remarks>
/// The index refresh interval is set to <c>-1</c> before loading and to <c>30s</c> afterwards. The
/// <c>30s</c> setting is applied in a <c>finally</c> block so a failed load does not leave it at <c>-1</c>.
/// Both bulk runs block the calling thread until they complete or fail.
/// </remarks>
/// <param name="usersPath">Passed to <c>StackOverflowData.GetUsers</c> to read the users to index.</param>
/// <param name="badgesPath">Passed to <c>StackOverflowData.GetBadgeMetas</c> to read the badges to apply.</param>
public void IndexUsers(string usersPath, string badgesPath)
{
    const int size = 1000;

    CreateUsersIndexIfNotExists();

    _client.Indices.UpdateSettings(UsersIndex, u => u
        .IndexSettings(i => i
            .RefreshInterval("-1")
        )
    );

    try
    {
        var users = StackOverflowData.GetUsers(usersPath);
        var observableBulk = _client.BulkAll(users, f => f
            .MaxDegreeOfParallelism(16)
            .BackOffTime(TimeSpan.FromSeconds(10))
            .BackOffRetries(2)
            .Size(size)
            .RefreshOnCompleted()
            .Index(UsersIndex)
        );

        var stopWatch = Stopwatch.StartNew();
        RunToCompletion(o => observableBulk.Subscribe(o), "users");
        Log.WriteLine($"Time taken to index users: {stopWatch.Elapsed}");

        // update user badges
        var badgeMetas = StackOverflowData.GetBadgeMetas(badgesPath);
        var observableBadgeBulk = _client.BulkAll(badgeMetas, f => f
            .Index<User>()
            .MaxDegreeOfParallelism(8)
            .Size(size)
            .BufferToBulk((bulk, badges) =>
            {
                foreach (var badge in badges)
                {
                    // The script creates the badges list when it is missing and otherwise adds the
                    // badge only if no existing entry has the same name.
                    bulk.Update<User>(u => u
                        .Script(s => s
                            .Source(@"if (ctx._source.badges == null) { ctx._source.badges = [params.badge]; } else if (ctx._source.badges.any(b -> b.name == params.badge.name) == false) { ctx._source.badges.add(params.badge); }")
                            .Params(d => d
                                .Add("badge", badge.Badge)
                            )
                        )
                        .Id(badge.UserId)
                        .RetriesOnConflict(10)
                    );
                }
            })
            .RefreshOnCompleted()
        );

        stopWatch.Restart();
        RunToCompletion(o => observableBadgeBulk.Subscribe(o), "badges");
        Log.WriteLine($"Time taken to index badges: {stopWatch.Elapsed}");
    }
    finally
    {
        // Runs on failure too, so the index is not left with refresh_interval -1.
        // NOTE(review): if this call itself throws it will replace the original exception.
        _client.Indices.UpdateSettings(UsersIndex, u => u
            .IndexSettings(i => i
                .RefreshInterval("30s")
            )
        );
    }

    // Subscribes a progress-logging observer through `subscribe`, blocks until the bulk run signals
    // completion or an error, and rethrows a reported error with its original stack trace.
    void RunToCompletion(Action<BulkAllObserver> subscribe, string label)
    {
        var seenPages = 0;
        var indexedDocs = 0;
        var totalDocs = 0;
        Exception exception = null;

        using (var handle = new ManualResetEvent(false))
        {
            var bulkObserver = new BulkAllObserver(
                onError: e =>
                {
                    exception = e;
                    handle.Set();
                },
                onCompleted: () => handle.Set(),
                onNext: b =>
                {
                    // Log the values returned by the atomic operations rather than re-reading
                    // the shared counters afterwards.
                    var indexed = Interlocked.Add(ref indexedDocs, b.Items.Count(i => i.IsValid));
                    var total = Interlocked.Add(ref totalDocs, b.Items.Count);
                    var page = Interlocked.Increment(ref seenPages);
                    Log.WriteLine($"indexed {label} page {page}, {indexed} out of {total}");
                }
            );

            subscribe(bulkObserver);
            handle.WaitOne();
        }

        if (exception != null)
        {
            // A plain `throw exception;` would reset the stack trace captured by the bulk run.
            System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(exception).Throw();
        }
    }
}
/// <summary>
/// Bulk indexes posts (questions and answers) into the posts index.
/// </summary>
/// <remarks>
/// The index refresh interval is set to <c>-1</c> before loading and to <c>30s</c> afterwards. The
/// <c>30s</c> setting is applied in a <c>finally</c> block so a failed load does not leave it at <c>-1</c>.
/// The call blocks until the bulk run completes or fails.
/// </remarks>
/// <param name="path">Passed to <c>StackOverflowData.GetPosts</c> to read the posts to index.</param>
public void IndexPosts(string path)
{
    const int size = 1000;

    CreatePostsIndexIfNotExists();

    _client.Indices.UpdateSettings(PostsIndex, u => u
        .IndexSettings(i => i
            .RefreshInterval("-1")
        )
    );

    try
    {
        var posts = StackOverflowData.GetPosts(path);
        var observableBulk = _client.BulkAll(posts, f => f
            .MaxDegreeOfParallelism(Environment.ProcessorCount * 2)
            .BackOffTime(TimeSpan.FromSeconds(10))
            .BackOffRetries(2)
            .Size(size)
            .BufferToBulk((bulk, buffer) =>
            {
                // Each post is added as an index operation typed by its concrete class;
                // anything that is not a Question is cast to Answer.
                foreach (var post in buffer)
                {
                    if (post is Question question)
                    {
                        bulk.AddOperation(new BulkIndexOperation<Question>(question));
                    }
                    else
                    {
                        bulk.AddOperation(new BulkIndexOperation<Answer>((Answer)post));
                    }
                }
            })
            .RefreshOnCompleted()
            .Index(PostsIndex)
        );

        var seenPages = 0;
        var indexedDocs = 0;
        var totalDocs = 0;
        Exception exception = null;
        Stopwatch stopWatch;

        using (var handle = new ManualResetEvent(false))
        {
            var bulkObserver = new BulkAllObserver(
                onError: e =>
                {
                    exception = e;
                    handle.Set();
                },
                onCompleted: () => handle.Set(),
                onNext: b =>
                {
                    // Log the values returned by the atomic operations rather than re-reading
                    // the shared counters afterwards.
                    var indexed = Interlocked.Add(ref indexedDocs, b.Items.Count(i => i.IsValid));
                    var total = Interlocked.Add(ref totalDocs, b.Items.Count);
                    var page = Interlocked.Increment(ref seenPages);
                    Log.WriteLine($"indexed page {page} of questions and answers, {indexed} out of {total}");
                }
            );

            stopWatch = Stopwatch.StartNew();
            observableBulk.Subscribe(bulkObserver);
            handle.WaitOne();
        }

        if (exception != null)
        {
            // A plain `throw exception;` would reset the stack trace captured by the bulk run.
            System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(exception).Throw();
        }

        Log.WriteLine($"time taken to index posts: {stopWatch.Elapsed}");
    }
    finally
    {
        // Runs on failure too, so the index is not left with refresh_interval -1.
        // NOTE(review): if this call itself throws it will replace the original exception.
        _client.Indices.UpdateSettings(PostsIndex, u => u
            .IndexSettings(i => i
                .RefreshInterval("30s")
            )
        );
    }
}