/// <summary>
/// Bulk-indexes <paramref name="documents"/> into <paramref name="index"/> with <c>BulkAll</c>, blocks for up to
/// five minutes until the observable completes or reports an error, then asserts the outcome.
/// </summary>
/// <param name="index">Name of the index the documents are written to and counted in.</param>
/// <param name="documents">Documents to index.</param>
/// <param name="size">Number of documents per bulk request.</param>
/// <param name="pages">Number of bulk responses the observer is expected to see.</param>
/// <param name="numberOfDocuments">Document count expected in the index afterwards.</param>
private void BulkAll(string index, IEnumerable<SmallObject> documents, int size, int pages, int numberOfDocuments)
{
    var handle = new ManualResetEvent(false);
    var seenPages = 0;

    // An exception thrown from inside an observer callback is not observed by the caller, so the
    // error is captured here and rethrown after the wait, with its original stack trace intact.
    System.Runtime.ExceptionServices.ExceptionDispatchInfo failure = null;

    //first we setup our cold observable
    var observableBulk = this._client.BulkAll(documents, f => f
        .MaxDegreeOfParallelism(8)
        .BackOffTime(TimeSpan.FromSeconds(10))
        .BackOffRetries(2)
        .Size(size)
        .RefreshOnCompleted()
        .Index(index)
    );

    //we set up an observer
    var bulkObserver = new BulkAllObserver(
        onError: (e) =>
        {
            // Store the error before signalling so the waiting thread is guaranteed to see it.
            failure = System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(e);
            handle.Set();
        },
        onCompleted: () => handle.Set(),
        onNext: (b) => Interlocked.Increment(ref seenPages)
    );

    //when we subscribe the observable becomes hot
    observableBulk.Subscribe(bulkObserver);
    handle.WaitOne(TimeSpan.FromMinutes(5));

    // Surface a bulk failure directly instead of as a misleading page/count assertion failure.
    failure?.Throw();

    seenPages.Should().Be(pages);
    var count = this._client.Count<SmallObject>(f => f.Index(index));
    count.Count.Should().Be(numberOfDocuments);
    bulkObserver.TotalNumberOfFailedBuffers.Should().Be(0);
}
/// <summary>
/// Pushes every document in <paramref name="lazyEnumerable"/> through <c>BulkAll</c> and asynchronously
/// waits for the observable to complete.
/// </summary>
/// <param name="lazyEnumerable">
/// Documents to index. Each document's "PartitionKey" and "RowKey" values are concatenated to form its id;
/// the "@timestamp" value of the first document in a batch is used to build that batch's index name.
/// </param>
/// <param name="mappingName">
/// Type name for the documents; also passed to the index namer and, lower-cased, used as the pipeline id
/// when <c>_setPipeline</c> is set.
/// </param>
/// <returns>The number of bulk responses observed before completion.</returns>
/// <remarks>An error reported by the observable faults the returned task.</remarks>
private async Task<int> PushAllImpl(IEnumerable<IDictionary<string, string>> lazyEnumerable, string mappingName)
{
    var seenPages = 0;

    // RunContinuationsAsynchronously: otherwise the code awaiting this task would resume inline on
    // the thread that completes it, i.e. inside the observer callback below.
    var tcs = new TaskCompletionSource<int>(TaskCreationOptions.RunContinuationsAsynchronously);

    var observableBulk = _client.BulkAll(lazyEnumerable, bulkDescriptor =>
    {
        bulkDescriptor
            .BufferToBulk((x, batch) => x
                .IndexMany(batch, (bd, d) => bd.Id(d["PartitionKey"] + d["RowKey"]))
                .Index(_indexNamer.BuildName(batch[0]["@timestamp"], mappingName)))
            .Type(mappingName);

        if (_setPipeline)
        {
            // NOTE(review): ToLower() is culture-sensitive; confirm whether the pipeline id should
            // be produced with ToLowerInvariant() instead.
            bulkDescriptor.Pipeline(mappingName.ToLower());
        }

        return bulkDescriptor
            .MaxDegreeOfParallelism(5)
            .Size(_batchSize);
    });

    // TrySet* rather than Set*: an observer callback must never throw, even if a terminal
    // notification were somehow delivered twice.
    var observer = new BulkAllObserver(
        onNext: (b) => Interlocked.Increment(ref seenPages),
        onCompleted: () => tcs.TrySetResult(seenPages),
        onError: e => tcs.TrySetException(e));

    observableBulk.Subscribe(observer);
    return await tcs.Task.ConfigureAwait(false);
}
/// <summary>
/// Bulk-indexes a sequence of documents and blocks until the operation has completed or failed.
/// </summary>
/// <typeparam name="T">Document type.</typeparam>
/// <param name="elasticClient">Client used to run the bulk operation.</param>
/// <param name="indexName">Name of the target index.</param>
/// <param name="list">Documents to index.</param>
/// <returns><c>true</c> when bulk indexing completed; <c>false</c> when it reported an error.</returns>
public static bool BulkAll<T>(IElasticClient elasticClient, IndexName indexName, IEnumerable<T> list)
    where T : class
{
    const int size = 1000;

    // Both synchronization objects are IDisposable; release them once the wait is over.
    using (var tokenSource = new CancellationTokenSource())
    using (var countdownEvent = new CountdownEvent(1))
    {
        var observableBulk = elasticClient.BulkAll(list, f => f
                .MaxDegreeOfParallelism(8)
                .BackOffTime(TimeSpan.FromSeconds(10))
                .BackOffRetries(2)
                .Size(size)
                .RefreshOnCompleted()
                .Index(indexName)
                .BufferToBulk((r, buffer) => r.IndexMany(buffer))
            , tokenSource.Token);

        // Written by the observer callback, read by this thread after the wait below.
        Exception exception = null;

        void OnCompleted()
        {
            WriteLine("BulkAll Finished");
            countdownEvent.Signal();
        }

        var bulkAllObserver = new BulkAllObserver(
            onNext: response =>
            {
                WriteLine($"Indexed {response.Page * size} with {response.Retries} retries");
            },
            onError: ex =>
            {
                WriteLine("BulkAll Error : {0}", ex);
                exception = ex;
                countdownEvent.Signal();
            },
            onCompleted: OnCompleted);

        observableBulk.Subscribe(bulkAllObserver);

        // Block until the observer signals completion or an error.
        countdownEvent.Wait(tokenSource.Token);

        if (exception != null)
        {
            WriteLine(Format, arg0: exception);
            return false;
        }

        return true;
    }
}
/// <summary>
/// Starts a large <c>BulkAll</c> run, cancels it through a cancellation token after a few seconds, and
/// asserts that only part of the documents were indexed, with no failed buffers and no retries.
/// </summary>
public void CancelBulkAll()
{
    var index = CreateIndexName();
    var signal = new ManualResetEvent(false);
    var size = 1000;
    var pages = 1000;
    var seenPages = 0;
    var numberOfDocuments = size * pages;
    var documents = CreateLazyStreamOfDocuments(numberOfDocuments);

    // The observable stays cold until something subscribes to it.
    var cancellation = new CancellationTokenSource();
    var bulkAll = Client.BulkAll(
        documents,
        descriptor => descriptor
            .MaxDegreeOfParallelism(8)
            .BackOffTime(TimeSpan.FromSeconds(10))
            .BackOffRetries(2)
            .Size(size)
            .RefreshOnCompleted()
            .Index(index),
        cancellation.Token);

    // The observer counts pages and hands any error to OnError together with the wait handle.
    Exception ex = null;
    var observer = new BulkAllObserver(
        onNext: page => Interlocked.Increment(ref seenPages),
        onError: error => OnError(ref ex, error, signal));

    // Subscribing turns the observable hot.
    bulkAll.Subscribe(observer);

    // Let some bulk requests go through, then cancel.
    signal.WaitOne(TimeSpan.FromSeconds(3));
    cancellation.Cancel();

    // Give in-flight requests a chance to observe the cancellation.
    signal.WaitOne(TimeSpan.FromSeconds(3));

    // A cancellation error is the expected outcome; anything else fails the test.
    if (ex is not null and not OperationCanceledException)
    {
        throw ex;
    }

    seenPages.Should().BeLessThan(pages).And.BeGreaterThan(0);

    var indexed = Client.Count(new CountRequest(index));
    indexed.Count.Should().BeLessThan(numberOfDocuments).And.BeGreaterThan(0);

    observer.TotalNumberOfFailedBuffers.Should().Be(0);
    observer.TotalNumberOfRetries.Should().Be(0);
}
/// <summary>
/// Bulk-indexes <paramref name="docs"/> into <paramref name="indexName"/> and blocks until the operation
/// has completed or failed.
/// </summary>
/// <typeparam name="T">Document type.</typeparam>
/// <param name="indexName">Name of the target index.</param>
/// <param name="docs">Documents to index.</param>
/// <returns><c>true</c> when bulk indexing completed; <c>false</c> when it reported an error.</returns>
public bool BulkAll<T>(string indexName, IEnumerable<T> docs)
    where T : class
{
    const int size = 1000;

    // Both synchronization objects are IDisposable; release them once the wait is over.
    using (var tokenSource = new CancellationTokenSource())
    using (var countdownEvent = new CountdownEvent(1))
    {
        var observableBulk = B2BElasticClient.BulkAll(docs, b => b
                .Index(indexName)
                .BackOffTime(TimeSpan.FromSeconds(10))
                .BackOffRetries(2)
                .RefreshOnCompleted()
                .MaxDegreeOfParallelism(Environment.ProcessorCount)
                .Size(size)
                .BufferToBulk((r, buffer) => r.IndexMany(buffer)),
            tokenSource.Token
        );

        // Written by the observer callback, read by this thread after the wait below.
        Exception exception = null;

        void OnCompleted()
        {
            Logger.Info("BulkAll Finished");
            countdownEvent.Signal();
        }

        var bulkAllObserver = new BulkAllObserver(
            onNext: response =>
            {
                Logger.Info($"Indexed {response.Page * size} with {response.Retries} retries");
            },
            onError: ex =>
            {
                Logger.Info("BulkAll Error : {0}", ex);
                exception = ex;
                countdownEvent.Signal();
            },
            onCompleted: OnCompleted);

        observableBulk.Subscribe(bulkAllObserver);

        // Block until the observer signals completion or an error.
        countdownEvent.Wait(tokenSource.Token);

        if (exception != null)
        {
            Logger.Info("BulkHotelGeo Error : {0}", exception);
            return false;
        }

        return true;
    }
}
/**
 * The internal implementation of `BulkAllObservable` is asynchronous, using the
 * https://docs.microsoft.com/en-us/dotnet/standard/events/observer-design-pattern[Observer Design Pattern] to enable observers to
 * be registered to take action when each bulk response is returned, an error has occurred, and when the `BulkAllObservable` has
 * finished. Whilst the internal implementation is asynchronous, you typically want to wait until all bulk indexing has finished before
 * continuing. The `Wait` method is a convenient shorthand to use for this, using a `ManualResetEvent` to block the current thread until
 * bulk indexing has finished or an error has occurred.
 *
 * ==== Advanced bulk indexing
 *
 * The `BulkAllObservable` helper exposes a number of methods to further control the process, such as
 *
 * * `BufferToBulk` to customize individual operations within the bulk request before it is dispatched to the server
 * * `RetryDocumentPredicate` to decide if a document that failed to be indexed should be retried
 * * `DroppedDocumentCallback` to determine what to do in the event a document is not indexed, even after retrying
 *
 * The following example demonstrates some of these methods, in addition to using a `BulkAllObserver` to subscribe to
 * the bulk indexing process and take some action on each successful bulk response, when an error occurs, and when
 * the process has finished.
 *
 * IMPORTANT: An observer such as `BulkAllObserver` should not throw exceptions from its interface implementations, such
 * as `OnNext` and `OnError`. Any exceptions thrown should be expected to go unhandled. In light of this, any exception
 * that occurs during the bulk indexing process should be captured and thrown outside of the observer, as demonstrated in the
 * example below. Take a look at the
 * https://docs.microsoft.com/en-us/dotnet/standard/events/observer-design-pattern-best-practices#handling-exceptions[Observer Design Pattern best practices]
 * on handling exceptions.
 */
public void AdvancedBulkIndexing()
{
    //hide
    var people = new Person[] { };

    var bulkAllObservable = client.BulkAll(people, b => b
        .BufferToBulk((descriptor, buffer) => //<1> Customise each bulk operation before it is dispatched
        {
            foreach (var person in buffer)
            {
                descriptor.Index<Person>(bi => bi
                    .Index(person.Id % 2 == 0 ? "even-index" : "odd-index") //<2> Index each document into either even-index or odd-index
                    .Document(person)
                );
            }
        })
        .RetryDocumentPredicate((bulkResponseItem, person) => //<3> Decide if a document should be retried in the event of a failure
        {
            return(bulkResponseItem.Error.Index == "even-index" && person.FirstName == "Martijn");
        })
        .DroppedDocumentCallback((bulkResponseItem, person) => //<4> If a document cannot be indexed this delegate is called
        {
            Console.WriteLine($"Unable to index: {bulkResponseItem} {person}");
        }));

    var waitHandle = new ManualResetEvent(false);
    // Holds an error reported to the observer so it can be rethrown after the wait below, outside the observer
    ExceptionDispatchInfo exceptionDispatchInfo = null;

    var observer = new BulkAllObserver(
        onNext: response =>
        {
            // do something e.g. write number of pages to console
        },
        onError: exception =>
        {
            exceptionDispatchInfo = ExceptionDispatchInfo.Capture(exception);
            waitHandle.Set();
        },
        onCompleted: () => waitHandle.Set());

    bulkAllObservable.Subscribe(observer); // <5> Subscribe to the observable, which will initiate the bulk indexing process

    waitHandle.WaitOne(); // <6> Block the current thread until a signal is received

    exceptionDispatchInfo?.Throw(); // <7> If an exception was captured during the bulk indexing process, throw it
}
/// <summary>
/// Starts a large <c>BulkAll</c> run, disposes the observable after a few seconds, and asserts that only
/// part of the documents were indexed, with no failed buffers and no retries.
/// </summary>
public void DisposingObservableCancelsBulkAll()
{
    var index = CreateIndexName();
    var handle = new ManualResetEvent(false);
    var size = 1000;
    var pages = 100;
    var seenPages = 0;
    var numberOfDocuments = size * pages;
    var documents = this.CreateLazyStreamOfDocuments(numberOfDocuments);

    // An exception thrown from inside an observer callback is not observed by the test, so the
    // error is captured here and inspected on the test thread once the waits are over.
    System.Runtime.ExceptionServices.ExceptionDispatchInfo failure = null;

    //first we setup our cold observable
    var observableBulk = this._client.BulkAll(documents, f => f
        .MaxDegreeOfParallelism(8)
        .BackOffTime(TimeSpan.FromSeconds(10))
        .BackOffRetries(2)
        .Size(size)
        .RefreshOnCompleted()
        .Index(index)
    );

    //we set up an observer
    var bulkObserver = new BulkAllObserver(
        // The handle is deliberately not signalled here so both waits below keep their original timing.
        onError: (e) => Volatile.Write(ref failure, System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(e)),
        onCompleted: () => handle.Set(),
        onNext: (b) => Interlocked.Increment(ref seenPages)
    );

    //when we subscribe the observable becomes hot
    observableBulk.Subscribe(bulkObserver);

    //we wait N seconds to see some bulks
    handle.WaitOne(TimeSpan.FromSeconds(3));
    observableBulk.Dispose();

    //we wait N seconds to give in flight request a chance to cancel
    handle.WaitOne(TimeSpan.FromSeconds(3));

    // A cancellation error is the expected result of disposing; anything else fails the test.
    var captured = Volatile.Read(ref failure);
    if (captured != null && !(captured.SourceException is OperationCanceledException))
    {
        captured.Throw();
    }

    seenPages.Should().BeLessThan(pages).And.BeGreaterThan(0);
    var count = this._client.Count<SmallObject>(f => f.Index(index));
    count.Count.Should().BeLessThan(numberOfDocuments).And.BeGreaterThan(0);
    bulkObserver.TotalNumberOfFailedBuffers.Should().Be(0);
    bulkObserver.TotalNumberOfRetries.Should().Be(0);
}
/// <summary>
/// Subscribes a <c>BulkAllObserver</c> to a <c>BulkAll</c> run that writes <c>index.Documents</c> into the
/// index named <c>index.Name</c>. The method does not wait on any handle after subscribing.
/// </summary>
/// <param name="index">Supplies the documents to index and the name of the target index.</param>
internal void Index(ElasticSearchIndex index)
{
    var client = GetClient();

    var bulkAll = client.BulkAll(
        index.Documents,
        descriptor => descriptor
            .Index(index.Name)
            .Type("entry")
            .BackOffRetries(5)
            .BackOffTime("30s")
            .RefreshOnCompleted(true)
            .MaxDegreeOfParallelism(4)
            .Size(30000));

    // NOTE(review): onError rethrows from inside the observer; confirm that anything actually
    // observes this exception, since nothing in this method waits for or catches it.
    bulkAll.Subscribe(new BulkAllObserver(
        onNext: page => DoSomethingElse((BulkAllResponse)page, index.Name),
        onError: error => { throw error; },
        onCompleted: () => DoSomething(index.Name)));
}
/// <summary>
/// Bulk sync to ES: documents flagged <c>Deleted</c> are sent as delete operations, all others as index
/// operations, each against the index named after the document's lower-cased runtime type name.
/// </summary>
/// <typeparam name="T">Document</typeparam>
/// <param name="addedOrUpdatedDocuments">The documents to synchronize</param>
/// <returns>
/// The ids of the documents that could NOT be synced, as a snapshot taken when the method returns.
/// </returns>
/// <remarks>
/// Waits at most one minute for the bulk operation. An error reported by the bulk operation is rethrown
/// on the calling thread; a timeout is logged as an error and the ids collected so far are returned.
/// </remarks>
internal List<string> Execute<T>(List<T> addedOrUpdatedDocuments) where T : DocumentBase
{
    var sw = new Stopwatch();
    sw.Start();
    Logger.Debug($"Got {addedOrUpdatedDocuments.Count} new/updated items!");

    var failedIds = new List<string>();
    // The dropped-document callback can run on several bulk workers at once (MaxDegreeOfParallelism
    // below) and List<T> is not thread-safe, so every access to failedIds goes through this gate.
    var failedIdsGate = new object();

    var bulkAllObservable = _esClient.BulkAll(addedOrUpdatedDocuments, b => b
        .BufferToBulk((descriptor, buffer) =>
        {
            foreach (T document in buffer)
            {
                if (document.Deleted)
                {
                    descriptor.Delete<T>(doc => doc.Index(document.GetType().Name.ToLower()).Document(document));
                    Logger.Debug($"Item {document.Id} marked to be deleted!");
                }
                else
                {
                    descriptor.Index<T>(doc => doc.Index(document.GetType().Name.ToLower()).Document(document));
                    Logger.Debug($"Item {document.Id} marked to be upserted!");
                }
            }
        })
        .DroppedDocumentCallback((bulkResponseItem, document) =>
        {
            Logger.Error($"Unable to index: {bulkResponseItem} {document}");
            lock (failedIdsGate)
            {
                failedIds.Add(document.Id);
            }
        })
        .BackOffTime("1s") //how long to wait between retries
        .BackOffRetries(_appSettings.Get(BusinessConstants.ElasticSearchBulkSyncNoOfRetries, DefaultValues.ElasticSearchBulkSyncNoOfRetries)) //how many retries are attempted if a failure occurs
        .RefreshOnCompleted() //refresh the index after bulk insert
        .MaxDegreeOfParallelism(Environment.ProcessorCount)
        .ContinueAfterDroppedDocuments(true)
        .Size(_appSettings.Get(BusinessConstants.ElasticSearchSyncBatchSize, DefaultValues.ElasticSearchSyncBatchSize)));

    var waitHandle = new ManualResetEvent(false);
    ExceptionDispatchInfo exceptionDispatchInfo = null;

    var observer = new BulkAllObserver(
        onNext: response =>
        {
            Logger.Debug($"Written {response.Items.Count} in ES");
        },
        onError: exception =>
        {
            exceptionDispatchInfo = ExceptionDispatchInfo.Capture(exception);
            waitHandle.Set();
        },
        onCompleted: () => waitHandle.Set());

    bulkAllObservable.Subscribe(observer); //Subscribe to the observable, which will initiate the bulk indexing process

    var finished = waitHandle.WaitOne(TimeSpan.FromMinutes(1)); //Block the current thread until a signal is received

    exceptionDispatchInfo?.Throw(); //If an exception was captured during the bulk indexing process, throw it

    if (!finished)
    {
        // Previously a timeout was indistinguishable from success; make it visible in the log.
        Logger.Error("Bulk sync did not finish within 1 minute; the returned list of failed ids may be incomplete");
    }

    sw.Stop();
    Logger.Debug("Finished in {ElapsedMilliseconds} ms", sw.ElapsedMilliseconds);

    // Return a snapshot: after a timeout the callback may still be adding ids in the background.
    lock (failedIdsGate)
    {
        return new List<string>(failedIds);
    }
}
/// <summary>
/// Optionally creates the "blogs" index, then reads blogs from the database in batches of
/// <c>BATCH_SIZE</c> and bulk-indexes each batch, waiting for one batch to finish before reading the next.
/// </summary>
/// <param name="endpoint">Elasticsearch endpoint URI.</param>
/// <param name="username">User name for basic authentication.</param>
/// <param name="password">Password for basic authentication.</param>
/// <param name="create">When <c>true</c>, create the "blogs" index (mappings and analysis settings) first.</param>
/// <remarks>
/// An error reported by a batch's bulk operation is rethrown on the calling thread, which stops the run.
/// </remarks>
public static void Run(string endpoint, string username, string password, bool create = false)
{
    var settings = new ConnectionSettings(new Uri(endpoint)).DefaultIndex("blogs").BasicAuthentication(username, password);
    var client = new ElasticClient(settings);

    if (create)
    {
        var resp = client.Indices.Create("blogs", cid => cid
            .Map<BlogIndexed>(m => m.AutoMap()
                .Properties(p => p.Keyword(kp => kp.Name(b => b.Author).Normalizer("lowercase")))
                .Properties(p => p.Text(tp => tp.Name(b => b.Title).Fields(f => f.Text(tf => tf.Analyzer("ngram_lc").Name("ngram_lc")))))
                .Properties(p => p.Keyword(tp => tp.Name(b => b.Tags).Fields(f => f.Text(tf => tf.Analyzer("ngram_lc").Name("ngram_lc"))))))
            .Settings(i => i.Setting("max_ngram_diff", 30)
                .Setting("max_result_window", 100000)
                .Setting("max_rescore_window", 100000)
                .Analysis(a => a.Analyzers(ana => ana.Custom("ngram_lc", c => c.Filters("lowercase").Tokenizer("ngram_tokenizer")))
                    .Tokenizers(t => t.NGram("ngram_tokenizer", n => n.MaxGram(30).MinGram(1).TokenChars(TokenChar.Letter, TokenChar.Digit)))
                    .Normalizers(n => n.Custom("lowercase", cn => cn.Filters("lowercase"))))));
        if (!resp.IsValid)
        {
            Console.WriteLine("error creating index");
            return;
        }
    }

    BlogContextFactory blogContextFactory = new BlogContextFactory();
    using (var db = blogContextFactory.Create())
    {
        var totalBlogs = db.Blogs.Where(b => b.BlogID > 0).Count();
        Console.WriteLine($"total blogs: {totalBlogs}");
        int lastBlogId = LAST_BLOG_ID;

        for (int i = 0; i < totalBlogs; i += BATCH_SIZE)
        {
            // Load one page of blogs together with their post counts and tags.
            var blogs = db.Blogs.Where(b => b.BlogID > LAST_BLOG_ID).OrderBy(b => b.BlogID).Skip(i).Take(BATCH_SIZE)
                .GroupJoin(db.Posts.Where(p => p.IdType == GmGard.Models.ItemType.Blog), b => b.BlogID, p => p.PostId, (b, p) => new { blog = b, post = p.Count() })
                .GroupJoin(db.TagsInBlogs.DefaultIfEmpty(), b => b.blog.BlogID, tib => tib.BlogID, (b, tib) => new
                {
                    b.blog,
                    tag = tib.Select(t => t.tag),
                    b.post,
                }).ToList();
            Console.WriteLine($"Send Items for {i} to {i + BATCH_SIZE - 1}");

            var bulk = client.BulkAll(blogs.Select(b => new BlogIndexed
            {
                Id = b.blog.BlogID,
                Title = b.blog.BlogTitle,
                Content = b.blog.Content,
                Tags = b.tag.Select(t => t.TagName),
                CreateDate = b.blog.BlogDate,
                CategoryId = b.blog.CategoryID,
                Author = b.blog.Author,
                IsHarmony = b.blog.isHarmony,
                IsApproved = b.blog.isApproved,
                BlogVisit = b.blog.BlogVisit,
                PostCount = b.post,
                Rating = b.blog.Rating ?? 0,
                ImagePath = b.blog.ImagePath,
                IsLocalImg = b.blog.IsLocalImg,
            }), s => s
                // in case of 429 response, how long we should wait before retrying
                .BackOffTime(TimeSpan.FromSeconds(5))
                // in case of 429 response, how many times to retry before failing
                .BackOffRetries(5)
                .Index<BlogIndexed>());

            // An exception thrown from inside an observer callback is not observed by this thread,
            // so the error is captured and rethrown below, with its original stack trace intact.
            System.Runtime.ExceptionServices.ExceptionDispatchInfo failure = null;

            // The wait below has no timeout, so the handle is always signalled before it is disposed.
            using (var waitHandle = new ManualResetEvent(false))
            {
                var bulkAllObserver = new BulkAllObserver(
                    onNext: bulkAllResponse =>
                    {
                        // do something after each bulk request
                        Console.WriteLine($"Done page {bulkAllResponse.Page} with retry {bulkAllResponse.Retries}");
                    },
                    onError: exception =>
                    {
                        // Store the error before signalling so the waiting thread is guaranteed to see it.
                        failure = System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(exception);
                        waitHandle.Set();
                    },
                    onCompleted: () =>
                    {
                        // do something when all bulk operations complete
                        waitHandle.Set();
                    });

                bulk.Subscribe(bulkAllObserver);
                waitHandle.WaitOne();
            }

            // Stop instead of silently carrying on to the next batch after a failed one.
            failure?.Throw();

            if (blogs.Count > 0)
            {
                lastBlogId = blogs.Last().blog.BlogID;
            }
            if (blogs.Count < BATCH_SIZE)
            {
                break;
            }
        }

        client.Indices.Refresh(Indices.Index("blogs"));
        Console.WriteLine($"last blogs: {lastBlogId}");
        Console.ReadLine();
    }
}
/// <summary>
/// Bulk-indexes users read from <paramref name="usersPath"/> into the users index, then applies badges
/// read from <paramref name="badgesPath"/> to those users with scripted bulk updates.
/// </summary>
/// <param name="usersPath">Path handed to <c>StackOverflowData.GetUsers</c>.</param>
/// <param name="badgesPath">Path handed to <c>StackOverflowData.GetBadgeMetas</c>.</param>
/// <remarks>
/// The index refresh interval is set to "-1" for the duration of the run and set to "30s" afterwards,
/// including when indexing fails. An error reported by either bulk operation is rethrown on the calling thread.
/// </remarks>
public void IndexUsers(string usersPath, string badgesPath)
{
    CreateUsersIndexIfNotExists();

    _client.Indices.UpdateSettings(UsersIndex, u => u
        .IndexSettings(i => i
            .RefreshInterval("-1")
        )
    );

    try
    {
        var size = 1000;
        var seenPages = 0;
        var indexedDocs = 0;
        var totalDocs = 0;
        var handle = new ManualResetEvent(false);

        var users = StackOverflowData.GetUsers(usersPath);
        var observableBulk = _client.BulkAll(users, f => f
            .MaxDegreeOfParallelism(16)
            .BackOffTime(TimeSpan.FromSeconds(10))
            .BackOffRetries(2)
            .Size(size)
            .RefreshOnCompleted()
            .Index(UsersIndex)
        );

        // Captured by the observer and rethrown on this thread once the wait is over.
        Exception exception = null;
        var bulkObserver = new BulkAllObserver(
            onError: e =>
            {
                exception = e;
                handle.Set();
            },
            onCompleted: () => handle.Set(),
            onNext: b =>
            {
                Interlocked.Add(ref indexedDocs, b.Items.Count(i => i.IsValid));
                Interlocked.Add(ref totalDocs, b.Items.Count);
                Interlocked.Increment(ref seenPages);
                Log.WriteLine($"indexed users page {seenPages}, {indexedDocs} out of {totalDocs}");
            }
        );

        var stopWatch = Stopwatch.StartNew();
        observableBulk.Subscribe(bulkObserver);
        handle.WaitOne();

        if (exception != null)
        {
            // Rethrow without resetting the stack trace recorded where the error occurred.
            System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(exception).Throw();
        }

        Log.WriteLine($"Time taken to index users: {stopWatch.Elapsed}");

        // update user badges
        seenPages = 0;
        indexedDocs = 0;
        totalDocs = 0;
        handle = new ManualResetEvent(false);

        var badgeMetas = StackOverflowData.GetBadgeMetas(badgesPath);
        var observableBadgeBulk = _client.BulkAll(badgeMetas, f => f
            .Index<User>()
            .MaxDegreeOfParallelism(8)
            .Size(size)
            .BufferToBulk((bulk, badges) =>
            {
                foreach (var badge in badges)
                {
                    // Scripted update: add the badge unless the user already has one with that name.
                    bulk.Update<User>(u => u
                        .Script(s => s
                            .Source(@"if (ctx._source.badges == null) { ctx._source.badges = [params.badge]; } else if (ctx._source.badges.any(b -> b.name == params.badge.name) == false) { ctx._source.badges.add(params.badge); }")
                            .Params(d => d
                                .Add("badge", badge.Badge)
                            )
                        )
                        .Id(badge.UserId)
                        .RetriesOnConflict(10)
                    );
                }
            })
            .RefreshOnCompleted()
        );

        bulkObserver = new BulkAllObserver(
            onError: e =>
            {
                exception = e;
                handle.Set();
            },
            onCompleted: () => handle.Set(),
            onNext: b =>
            {
                Interlocked.Add(ref indexedDocs, b.Items.Count(i => i.IsValid));
                Interlocked.Add(ref totalDocs, b.Items.Count);
                Interlocked.Increment(ref seenPages);
                Log.WriteLine($"indexed badges page {seenPages}, {indexedDocs} out of {totalDocs}");
            }
        );

        stopWatch.Restart();
        observableBadgeBulk.Subscribe(bulkObserver);
        handle.WaitOne();

        if (exception != null)
        {
            System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(exception).Throw();
        }

        Log.WriteLine($"Time taken to index badges: {stopWatch.Elapsed}");
    }
    finally
    {
        // Always re-enable periodic refresh, even if indexing failed part-way through.
        _client.Indices.UpdateSettings(UsersIndex, u => u
            .IndexSettings(i => i
                .RefreshInterval("30s")
            )
        );
    }
}
/// <summary>
/// Bulk-indexes the questions and answers read from <paramref name="path"/> into the posts index.
/// </summary>
/// <param name="path">Path handed to <c>StackOverflowData.GetPosts</c>.</param>
/// <remarks>
/// The index refresh interval is set to "-1" for the duration of the run and set to "30s" afterwards,
/// including when indexing fails. An error reported by the bulk operation is rethrown on the calling thread.
/// </remarks>
public void IndexPosts(string path)
{
    CreatePostsIndexIfNotExists();

    _client.Indices.UpdateSettings(PostsIndex, u => u
        .IndexSettings(i => i
            .RefreshInterval("-1")
        )
    );

    try
    {
        var handle = new ManualResetEvent(false);
        var size = 1000;
        var posts = StackOverflowData.GetPosts(path);

        var observableBulk = _client.BulkAll(posts, f => f
            .MaxDegreeOfParallelism(Environment.ProcessorCount * 2)
            .BackOffTime(TimeSpan.FromSeconds(10))
            .BackOffRetries(2)
            .Size(size)
            .BufferToBulk((bulk, buffer) =>
            {
                // Add each post as an index operation typed by its concrete class;
                // anything that is not a Question is cast to Answer.
                foreach (var post in buffer)
                {
                    if (post is Question question)
                    {
                        var item = new BulkIndexOperation<Question>(question);
                        bulk.AddOperation(item);
                    }
                    else
                    {
                        var answer = (Answer)post;
                        var item = new BulkIndexOperation<Answer>(answer);
                        bulk.AddOperation(item);
                    }
                }
            })
            .RefreshOnCompleted()
            .Index(PostsIndex)
        );

        var seenPages = 0;
        var indexedDocs = 0;
        var totalDocs = 0;

        // Captured by the observer and rethrown on this thread once the wait is over.
        Exception exception = null;
        var bulkObserver = new BulkAllObserver(
            onError: e =>
            {
                exception = e;
                handle.Set();
            },
            onCompleted: () => handle.Set(),
            onNext: b =>
            {
                Interlocked.Add(ref indexedDocs, b.Items.Count(i => i.IsValid));
                Interlocked.Add(ref totalDocs, b.Items.Count);
                Interlocked.Increment(ref seenPages);
                Log.WriteLine($"indexed page {seenPages} of questions and answers, {indexedDocs} out of {totalDocs}");
            }
        );

        var stopWatch = Stopwatch.StartNew();
        observableBulk.Subscribe(bulkObserver);
        handle.WaitOne();

        if (exception != null)
        {
            // Rethrow without resetting the stack trace recorded where the error occurred.
            System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(exception).Throw();
        }

        Log.WriteLine($"time taken to index posts: {stopWatch.Elapsed}");
    }
    finally
    {
        // Always re-enable periodic refresh, even if indexing failed part-way through.
        _client.Indices.UpdateSettings(PostsIndex, u => u
            .IndexSettings(i => i
                .RefreshInterval("30s")
            )
        );
    }
}