/// <summary>
/// Streams tweets matching the "Trump" track filter, maps each status with <c>MapTweet</c>,
/// and bulk-indexes them in buffers of five into an hourly "tweets-yyyy.MM.dd.HH" index
/// until a key is pressed.
/// </summary>
/// <param name="client">Elasticsearch client used to send the bulk requests.</param>
/// <exception cref="InvalidOperationException">
/// One of the required Twitter credential environment variables is not set.
/// </exception>
private static void IndexTweets(ElasticClient client)
{
    // The Twitter credentials used to be hard-coded here. Secrets must not live in
    // source control, so they are read from the environment instead. The values that
    // were previously committed should be treated as leaked and revoked.
    Tokens tokens = Tokens.Create(
        GetRequiredTwitterSetting("TWITTER_CONSUMER_KEY"),
        GetRequiredTwitterSetting("TWITTER_CONSUMER_SECRET"),
        GetRequiredTwitterSetting("TWITTER_ACCESS_TOKEN"),
        GetRequiredTwitterSetting("TWITTER_ACCESS_TOKEN_SECRET"));

    var subscription = tokens.Streaming
        .FilterAsObservable(track => "Trump")
        .OfType<StatusMessage>()
        .Select(MapTweet)
        .Buffer(5)
        .SelectMany(tweets =>
            // Declare one bulk operation per buffer of tweets.
            client.BulkAll(
                // POCOs to index.
                tweets,
                // Indexing instructions.
                ctx => ctx
                    // Index name (one index per hour).
                    .Index("tweets-" + DateTime.Now.ToString("yyyy.MM.dd.HH"))
                    // Mapping type.
                    .Type("tweets")))
        .Subscribe(
            onNext: response => Console.WriteLine("Indexed a list of 5 tweet."),
            onError: error => Console.WriteLine("error {0}", error));

    try
    {
        Thread.Sleep(30 * 1000);
        Console.WriteLine("Press some key to stop");
        Console.ReadKey();
    }
    finally
    {
        // Release the stream even when ReadKey throws (e.g. redirected input).
        subscription.Dispose();
    }
}

/// <summary>Reads a mandatory Twitter credential from the process environment.</summary>
private static string GetRequiredTwitterSetting(string name)
{
    var value = Environment.GetEnvironmentVariable(name);
    if (string.IsNullOrEmpty(value))
    {
        throw new InvalidOperationException("Missing required environment variable '" + name + "'.");
    }

    return value;
}
/// <summary>
/// Bulk-indexes the packages yielded by <c>nugetDumpReader</c> and blocks until the
/// bulk run either completes or fails. A failure is rethrown on the calling thread
/// with its original stack trace.
/// </summary>
public void InsertDocuments()
{
    Console.WriteLine("Setting up a lazy xml files reader that yields packages...");
    var lazyPackages = nugetDumpReader.GetPackages();

    Console.Write("Indexing documents into Elasticsearch...");
    var finished = new CountdownEvent(1);

    var observable = elasticClient.BulkAll(lazyPackages, bulk => bulk
        .BackOffRetries(2)
        .BackOffTime("30s")
        .RefreshOnCompleted(true)
        .MaxDegreeOfParallelism(4)
        .Size(1000));

    ExceptionDispatchInfo failure = null;

    var observer = new BulkAllObserver(
        onNext: page => Console.Write("."),
        onError: error =>
        {
            // Keep the exception so it can be rethrown after the wait below.
            failure = ExceptionDispatchInfo.Capture(error);
            finished.Signal();
        },
        onCompleted: () => finished.Signal());

    observable.Subscribe(observer);
    finished.Wait();

    if (failure != null)
    {
        failure.Throw();
    }

    Console.WriteLine("Done.");
}
/// <summary>
/// Empties <paramref name="indexName"/> with a delete-by-query and then bulk-indexes
/// <paramref name="dataToProcess"/> into it, blocking until the bulk run finishes.
/// </summary>
/// <typeparam name="T">Document type being indexed.</typeparam>
/// <param name="dataToProcess">Documents to index.</param>
/// <param name="client">Elasticsearch client used for both requests.</param>
/// <param name="indexName">Target index.</param>
/// <remarks>
/// If the bulk run fails, the exception reported to the observer is rethrown on the
/// calling thread after the wait completes.
/// </remarks>
public static void LoadCollectionInfoIndex<T>(List<T> dataToProcess, ElasticClient client, string indexName) where T : class
{
    // Clear the index before loading.
    client.DeleteByQuery<T>(del => del
        .Query(q => q.QueryString(qs => qs.Query("*")))
        .Index(indexName));

    System.Runtime.ExceptionServices.ExceptionDispatchInfo failure = null;

    using (var waitHandle = new CountdownEvent(1))
    {
        var bulkAll = client.BulkAll(dataToProcess, b => b
            .Index(indexName)
            .BackOffRetries(2)
            .BackOffTime("30s")
            .RefreshOnCompleted(true)
            .MaxDegreeOfParallelism(4)
            .Size(10000));

        bulkAll.Subscribe(new BulkAllObserver(
            onNext: b => Console.Write("."),
            onError: err =>
            {
                // Throwing here would run on a background continuation, never reach the
                // caller, and leave the handle unsignalled (Wait would block forever).
                failure = System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(err);
                waitHandle.Signal();
            },
            onCompleted: () => waitHandle.Signal()));

        waitHandle.Wait();
    }

    if (failure != null)
    {
        failure.Throw();
    }
}
/// <summary>
/// Runs BulkAll against a two-node virtual cluster whose client calls always fail and
/// asserts that the run halts with a <c>TransportException</c> after three bulk
/// requests, without ever delivering a page to OnNext.
/// </summary>
[U] public void Completes()
{
    var virtualCluster = Virtual.Elasticsearch
        .Bootstrap(2)
        .ClientCalls(c => c.FailAlways())
        .StaticConnectionPool()
        .AllDefaults();

    var connectionSettings = new ConnectionSettings(virtualCluster.ConnectionPool, virtualCluster.Connection)
        .ApplyDomainSettings();
    var failingClient = new ElasticClient(connectionSettings);

    var indexName = CreateIndexName();

    const int pageSize = 1000;
    const int pageCount = 10;
    var pagesSeen = 0;
    var lazyDocuments = CreateLazyStreamOfDocuments(pageSize * pageCount);

    var bulkRequests = 0;
    Exception caught = null;
    var cancellation = new CancellationTokenSource();

    var bulkAll = failingClient.BulkAll(
        lazyDocuments,
        descriptor => descriptor
            .MaxDegreeOfParallelism(1)
            .BulkResponseCallback(r => Interlocked.Increment(ref bulkRequests))
            .BackOffTime(TimeSpan.FromMilliseconds(1))
            .BackOffRetries(2)
            .Size(pageSize)
            .RefreshOnCompleted()
            .Index(indexName)
            .BufferToBulk((r, buffer) => r.IndexMany(buffer)),
        cancellation.Token);

    try
    {
        bulkAll.Wait(TimeSpan.FromSeconds(30), page => Interlocked.Increment(ref pagesSeen));
    }
    catch (Exception e)
    {
        caught = e;
    }

    caught.Should().NotBeNull();
    var transportException = caught.Should().BeOfType<TransportException>().Subject;
    transportException.Message.Should().StartWith("BulkAll halted after");

    bulkRequests.Should().Be(3);

    // OnNext only called for successful batches.
    pagesSeen.Should().Be(0);
}
/// <summary>
/// Bulk-indexes <paramref name="data"/> into the index that
/// <c>ElasticClientManager.GetIndexName</c> returns for <typeparamref name="T"/>,
/// waiting up to 15 minutes and writing a dot per indexed page.
/// </summary>
/// <typeparam name="T">Document type being indexed.</typeparam>
/// <param name="data">Documents to index.</param>
/// <param name="client">Elasticsearch client used for the bulk requests.</param>
public void BulkWriteData<T>(List<T> data, ElasticClient client) where T : class
{
    var observable = client.BulkAll(data, descriptor => descriptor
        .Index(ElasticClientManager.GetIndexName<T>())
        .BackOffTime("30s")
        .BackOffRetries(2)
        .MaxDegreeOfParallelism(Environment.ProcessorCount)
        .Size(100));

    observable.Wait(TimeSpan.FromMinutes(15), page => Console.Write("."));
}
/// <summary>
/// Builds (without subscribing to) a BulkAll observable that indexes
/// <paramref name="prices"/> into the "prices" index in pages of 1000, continues after
/// dropped documents, and reports each dropped document to <c>DroppedResponseCallback</c>.
/// </summary>
/// <param name="client">Elasticsearch client used for the bulk requests.</param>
/// <param name="prices">Price documents to index.</param>
/// <returns>The configured observable.</returns>
private static BulkAllObservable<PriceItem> CreateBulkAllObservable(ElasticClient client, IEnumerable<PriceItem> prices) =>
    client.BulkAll(prices, descriptor => descriptor
        .Index("prices")
        .BackOffTime(TimeSpan.FromSeconds(10))
        .BackOffRetries(3)
        .MaxDegreeOfParallelism(Environment.ProcessorCount)
        .Size(1000)
        .RefreshOnCompleted()
        .ContinueAfterDroppedDocuments()
        .DroppedDocumentCallback(DroppedResponseCallback));
/// <summary>
/// Ensures <paramref name="indexName"/> exists and then bulk-indexes
/// <paramref name="data"/> into it without blocking the calling thread.
/// </summary>
/// <typeparam name="T">Document type being indexed.</typeparam>
/// <param name="data">Documents to index.</param>
/// <param name="indexName">Target index.</param>
/// <returns>
/// A task that completes when the bulk run finishes or when 15 minutes have elapsed,
/// whichever comes first. A bulk failure faults the task.
/// </returns>
public async Task BulkAddOrUpdate<T>(IEnumerable<T> data, string indexName) where T : class
{
    await CreateIndexIfNotExists(indexName);

    // The previous implementation called the blocking Wait(...) helper inside an async
    // method, pinning a thread for up to 15 minutes. Bridge the observable to a task.
    var completion = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);

    _client.BulkAll(data, b => b
            .Index(indexName)
            .BackOffTime("30s")
            .BackOffRetries(2)
            .RefreshOnCompleted()
            .MaxDegreeOfParallelism(Environment.ProcessorCount)
            .Size(1000))
        .Subscribe(new BulkAllObserver(
            onNext: _ => { },
            onError: error => completion.TrySetException(error),
            onCompleted: () => completion.TrySetResult(true)));

    // Same ceiling as before: give up waiting after 15 minutes without raising an error.
    var first = await Task.WhenAny(completion.Task, Task.Delay(TimeSpan.FromMinutes(15)));
    if (first == completion.Task)
    {
        // Observe the outcome so a bulk failure surfaces to the caller.
        await completion.Task;
    }
}
/// <summary>
/// Bulk-indexes <paramref name="documents"/> into the "kabadibulk" index in pages of
/// 9999, waiting up to 15 minutes for the run to finish.
/// </summary>
/// <param name="EsClient">Elasticsearch client used for the bulk requests.</param>
/// <param name="documents">Documents to index.</param>
public void BulkInsert(ElasticClient EsClient, List<SearchCricketData> documents)
{
    var observable = EsClient.BulkAll(documents, descriptor => descriptor
        .Index("kabadibulk")
        .BackOffTime("30s")
        .BackOffRetries(2)
        .RefreshOnCompleted()
        .MaxDegreeOfParallelism(Environment.ProcessorCount)
        .Size(9999));

    // The observer returned by Wait is not needed here.
    observable.Wait(TimeSpan.FromMinutes(15), page => { });
}
/// <summary>
/// Bulk-indexes <paramref name="tags"/> in pages of 1000, waiting up to 15 minutes,
/// and forwards every page response to <paramref name="onNext"/>.
/// </summary>
/// <param name="tags">Tag documents to index.</param>
/// <param name="onNext">Callback invoked for each bulk page response.</param>
/// <returns>The observer returned by the blocking wait.</returns>
public BulkAllObserver BulkAll(IEnumerable<Tag> tags, Action<BulkAllResponse> onNext)
{
    var observable = _client.BulkAll(tags, descriptor => descriptor
        // .Index("people")
        .BackOffTime("30s")
        .BackOffRetries(2)
        .RefreshOnCompleted()
        .MaxDegreeOfParallelism(Environment.ProcessorCount)
        .Size(1000));

    return observable.Wait(TimeSpan.FromMinutes(15), onNext);
}
/// <summary>
/// Bulk-indexes the posted <paramref name="events"/> in pages of 1000 and blocks until
/// the bulk run finishes.
/// </summary>
/// <param name="events">Event documents taken from the request body.</param>
/// <returns><c>true</c> when the bulk run completed.</returns>
/// <remarks>
/// If the bulk run fails, the exception reported to the observer is rethrown on the
/// calling thread.
/// </remarks>
public bool PostEvents([FromBody] IEnumerable<object> events)
{
    System.Runtime.ExceptionServices.ExceptionDispatchInfo failure = null;

    using (var waitHandle = new CountdownEvent(1))
    {
        var bulkAll = ElasticClient.BulkAll(events, e => e.Size(1000));

        bulkAll.Subscribe(new BulkAllObserver(
            onNext: b => Console.Write("."),
            onError: error =>
            {
                // Throwing inside the observer never reaches this method and would leave
                // the handle unsignalled, blocking the request forever.
                failure = System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(error);
                waitHandle.Signal();
            },
            onCompleted: () => waitHandle.Signal()));

        waitHandle.Wait();
    }

    if (failure != null)
    {
        failure.Throw();
    }

    return true;
}
/// <summary>
/// Seeds Elasticsearch with the items returned by <c>GetAllNews</c>, indexing them in
/// pages of 100 and blocking until the bulk run completes or fails.
/// </summary>
/// <param name="client">Elasticsearch client used for the bulk requests.</param>
/// <remarks>A bulk failure is written to the console; it is not rethrown.</remarks>
public static void DataSeed(ElasticClient client)
{
    var newsList = GetAllNews();

    using (var waitHandle = new CountdownEvent(1))
    {
        var bulkAll = client.BulkAll<News>(newsList, b => b
            .BackOffRetries(2)                       // number of retries after a failed attempt
            .BackOffTime(TimeSpan.FromSeconds(15))   // delay before retrying a failed attempt
            .RefreshOnCompleted(true)                // refresh once done so the documents become searchable
            .MaxDegreeOfParallelism(4)
            .Size(100));                             // bulk page size: documents are sent 100 at a time

        bulkAll.Subscribe(new BulkAllObserver(
            onNext: b => Console.WriteLine("aktarım basliyor"),
            onError: e =>
            {
                Console.WriteLine("Hata : {0}", e.Message);
                // Without this signal a failed run left Wait() blocked forever.
                waitHandle.Signal();
            },
            onCompleted: () => waitHandle.Signal()));

        waitHandle.Wait();
    }
}
/// <summary>
/// Converts the worksheet in <paramref name="dataLoad"/> into QNA documents, creates the
/// target index when it does not exist, optionally appends naming and chit-chat
/// questions, and bulk-indexes everything.
/// </summary>
/// <param name="dataLoad">Upload descriptor (stream, target index, bot name).</param>
/// <returns>
/// An anonymous object with <c>message</c>, <c>metrics</c> (number of bulk pages seen)
/// and <c>newIndex</c>. On failure <c>message</c> carries the exception message.
/// </returns>
public dynamic AddDocuments(DataLoad dataLoad)
{
    int batch = 0;
    bool newIndex = false;

    try
    {
        var qnas = ExcelHelper.WorkSheetToQNA(dataLoad.Stream);
        List<QNA> modifiedQNA = qnas
            .Select(i => new QNA
            {
                Question = i.Question,
                Answer = string.IsNullOrEmpty(i.Answer) ? string.Empty : ClearDataWithAnalyzer(i.Answer),
                Type = DocumentType.qna.ToString()
            })
            .ToList();

        // Only attempt the bot id update when a bot name was supplied.
        var addedBotId = !string.IsNullOrEmpty(dataLoad.BotName) && UpdateBotIds(dataLoad);

        if (!_elasticClient.Indices.Exists(dataLoad.Index).Exists)
        {
            newIndex = true;
            _elasticClient.Indices.Create(dataLoad.Index);
        }

        if (addedBotId && newIndex)
        {
            var nameQuestions = AddNamingQuestionsToIndex(dataLoad);
            var chitChat = AddChitChat(dataLoad);
            modifiedQNA.AddRange(nameQuestions);
            modifiedQNA.AddRange(chitChat);
        }

        // BulkAll may deliver page responses from several threads at once, so the
        // counter must be incremented atomically.
        _elasticClient.BulkAll(modifiedQNA, b => b.Index(dataLoad.Index))
            .Wait(TimeSpan.FromMinutes(30), next => System.Threading.Interlocked.Increment(ref batch));
    }
    catch (Exception ex)
    {
        return new { message = ex.Message, metrics = batch, newIndex };
    }

    return new { message = loadSuccess, metrics = batch, newIndex };
}
/// <summary>
/// Loads product documents from the JSON data file and bulk-indexes them into the
/// "productdatanewdynamic" index, waiting up to 30 seconds.
/// </summary>
/// <returns>
/// <c>true</c> when the bulk wait returned without an exception; <c>false</c> when the
/// file held no documents or the bulk run threw.
/// </returns>
public bool CreateIndex()
{
    List<ProductModel> productData;
    var seenPages = 0;
    IndexName indexName = "productdatanewdynamic";
    CancellationTokenSource tokenSource = new CancellationTokenSource();
    ConcurrentBag<BulkAllResponse> bulkAllResponses = new ConcurrentBag<BulkAllResponse>();
    ConcurrentBag<dynamic> deadLetterQueue = new ConcurrentBag<dynamic>();

    // Deserialize JSON directly from a file.
    // NOTE(review): the path is a developer-machine absolute path; it should come from
    // configuration.
    using (StreamReader files = File.OpenText(@"C:\Users\akshay.badhiye\source\repos\Znode_ElasticSearch_POC\Znode_ElasticSearch_POC\Data\Data.json"))
    {
        JsonSerializer serializer = new JsonSerializer();
        productData = (List<ProductModel>)serializer.Deserialize(files, typeof(List<ProductModel>));
    }

    if (productData == null)
    {
        // An empty or "null" JSON document deserializes to null; nothing to index.
        return false;
    }

    var observableBulk = elasticClient.BulkAll(productData, f => f
        .MaxDegreeOfParallelism(Environment.ProcessorCount)
        .DroppedDocumentCallback((item, product) => deadLetterQueue.Add(product))
        .BackOffTime(TimeSpan.FromMilliseconds(10))
        .Size(500)
        .RefreshOnCompleted()
        .Index(indexName), tokenSource.Token);

    try
    {
        observableBulk.Wait(TimeSpan.FromSeconds(30), b =>
        {
            bulkAllResponses.Add(b);
            Interlocked.Increment(ref seenPages);
        });
    }
    catch (Exception e)
    {
        Debug.Write(e);
        // Previously the method reported success even when the bulk run failed.
        return false;
    }

    return true;
}
/// <summary>
/// Bulk-indexes <c>Data.AuthorBooks</c> into the "authorbooks" index in pages of 200
/// and returns the per-item result strings of every page that was indexed.
/// </summary>
/// <returns>An OK result whose payload is the collected item results.</returns>
public IActionResult IndexBulkBetter()
{
    // Page callbacks can arrive from more than one thread, so collect under a lock.
    // The previous code overwrote a single array per page and returned only one page.
    var collected = new System.Collections.Generic.List<string>();
    var gate = new object();

    // https://www.elastic.co/guide/en/elasticsearch/client/net-api/current/indexing-documents.html#_multiple_documents_with_bulkallobservable_helper
    _client.BulkAll(Data.AuthorBooks, b => b
            .Index("authorbooks")
            .BackOffTime("30s")      // time to wait between retries
            .BackOffRetries(2)       // max amount to retry
            .RefreshOnCompleted()    // refresh so indexed documents are visible to reads afterwards
            .Size(200))              // items per bulk request
        .Wait(TimeSpan.FromMinutes(15), response => // this will block main thread (not in core though)
        {
            var pageResults = response.Items.Select(r => r.Result).ToArray();
            lock (gate)
            {
                collected.AddRange(pageResults);
            }
        });

    string[] results;
    lock (gate)
    {
        results = collected.ToArray();
    }

    return Ok(results);
}
/// <summary>
/// Bulk-indexes <paramref name="documents"/> into the configured index without
/// blocking the calling thread.
/// </summary>
/// <param name="documents">Documents to index.</param>
/// <returns>
/// A task that yields <c>true</c> once the bulk run has completed. A bulk failure
/// faults the task with the reported exception.
/// </returns>
public async Task<bool> AddDocuments(List<T> documents)
{
    // The previous version blocked on a CountdownEvent inside an async method and, on
    // error, threw from the observer without signalling, so the call never returned.
    var completion = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);

    var bulkAll = _esClient.BulkAll(documents, b => b
        .Index(_indexValue)
        .BackOffRetries(2)
        .BackOffTime("30s")
        .RefreshOnCompleted(true)
        .MaxDegreeOfParallelism(4)
        .Size(1000));

    bulkAll.Subscribe(new BulkAllObserver(
        onNext: _ => Console.Write("."),
        onError: error => completion.TrySetException(error),
        onCompleted: () => completion.TrySetResult(true)));

    return await completion.Task;
}
/// <summary>
/// Sends one bulk request that creates every dictionary in <paramref name="obj"/> as a
/// document, using the dictionary's "id" entry as the document id.
/// </summary>
/// <param name="index">Target index.</param>
/// <param name="type">Mapping type.</param>
/// <param name="obj">Documents to create; each must contain an "id" key.</param>
/// <returns>The bulk response's <c>IsValid</c> flag.</returns>
public Boolean CreateBulkDocument(String index, String type, List<Dictionary<String, Object>> obj)
{
    var descriptor = new BulkDescriptor();
    descriptor.CreateMany<Dictionary<String, Object>>(
        obj,
        (bd, q) => bd.Id(q["id"].ToString()).Index(index).Type(type));

    var bulkResponse = _elasticClient.Bulk(descriptor);

    // A BulkAll observable used to be built here but was never subscribed to, so it
    // never sent anything. The dead code has been removed.

    if (bulkResponse.ServerError != null && bulkResponse.ServerError.ToString().Length > 0)
    {
        Console.WriteLine("Error : {0}", bulkResponse.ServerError.ToString());
        Console.WriteLine("Error Content : {0}", Newtonsoft.Json.JsonConvert.SerializeObject(obj));
    }

    return bulkResponse.IsValid;
}
/// <summary>
/// Bulk-indexes <paramref name="items"/> into the lower-cased <paramref name="index"/>
/// using the CLR type name as mapping type, logging progress per page and blocking
/// until the run completes or fails.
/// </summary>
/// <typeparam name="T">Document type being indexed.</typeparam>
/// <param name="client">Elasticsearch client used for the bulk requests.</param>
/// <param name="index">Target index name (lower-cased before use).</param>
/// <param name="items">Documents to index.</param>
/// <remarks>
/// If the bulk run fails, the exception is logged and then rethrown on the calling
/// thread.
/// </remarks>
public static void BulkInsert<T>(ElasticClient client, string index, ICollection<T> items) where T : class
{
    const int pageSize = 1000;
    string typeName = typeof(T).Name;

    Utils.Log("Performing bulk insert of {0} {1} into {2}.", items.Count, typeName, index);

    System.Runtime.ExceptionServices.ExceptionDispatchInfo failure = null;

    using (var waitHandle = new CountdownEvent(1))
    {
        int totalPages = items.Count / pageSize;

        var bulkAll = client.BulkAll(items, b => b
            .Index(index.ToLower())
            .Type(typeName)
            .BackOffRetries(2)
            .BackOffTime("10s")
            .RefreshOnCompleted(true)
            .MaxDegreeOfParallelism(4)
            .Size(pageSize));

        bulkAll.Subscribe(new BulkAllObserver(
            onNext: b => Utils.Log(" Working - page {0} of {1}...", b.Page, totalPages),
            onError: e =>
            {
                Utils.Log("Exception: " + e.Message);
                // Rethrowing from the observer never reached the caller, which then
                // carried on as if indexing had succeeded. Capture it instead.
                failure = System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(e);
                waitHandle.Signal();
            },
            onCompleted: () =>
            {
                Utils.Log("Index complete: {0} documents indexed", items.Count);
                waitHandle.Signal();
            }));

        waitHandle.Wait();
    }

    if (failure != null)
    {
        failure.Throw();
    }
}
/// <summary>
/// Sample: bulk-indexes three hard-coded <c>Person</c> documents into the "people"
/// index in a single page, waiting up to 15 minutes.
/// </summary>
/// <param name="client">Elasticsearch client used for the bulk request.</param>
public static void BulkAll(ElasticClient client)
{
    var samplePeople = new[]
    {
        new Person { Id = 1, FirstName = "Martijn", LastName = "Laarman" },
        new Person { Id = 2, FirstName = "Stuart", LastName = "Cam" },
        new Person { Id = 3, FirstName = "Russ", LastName = "Cam" }
    };

    var observable = client.BulkAll(samplePeople, descriptor => descriptor
        .Index("people")
        .BackOffTime("30s")
        .BackOffRetries(2)
        .RefreshOnCompleted()
        .MaxDegreeOfParallelism(Environment.ProcessorCount)
        .Size(3));

    observable.Wait(TimeSpan.FromMinutes(15), page =>
    {
        // do something e.g. write number of pages to console
    });
}
/// <summary>
/// Bulk-indexes a possibly polymorphic collection: the items are grouped by runtime
/// type and each group is indexed into <paramref name="index"/> with the runtime type
/// name as mapping type. Blocks until every group has been processed.
/// </summary>
/// <typeparam name="T">Static element type of the collection.</typeparam>
/// <param name="esClient">Elasticsearch client used for the bulk requests.</param>
/// <param name="index">Target index.</param>
/// <param name="readings">Documents to index.</param>
/// <remarks>
/// If a group's bulk run fails, the exception is rethrown on the calling thread and
/// the remaining groups are not processed.
/// </remarks>
public static void BulkInsert<T>(ElasticClient esClient, string index, ICollection<T> readings) where T : class
{
    const int pageSize = 1000;

    // Handle type-specific indexing for polymorphic lists of objects.
    var types = readings
        .GroupBy(x => x.GetType())
        .Select(x => new { Type = x.Key, Items = x.ToList() });

    foreach (var typeBatch in types)
    {
        Utils.Log("Performing bulk insert of {0} {1} into {2}.", typeBatch.Items.Count, typeBatch.Type.Name, index);

        // Page total for THIS type batch; it used to be computed from the whole collection.
        int totalPages = typeBatch.Items.Count / pageSize;

        System.Runtime.ExceptionServices.ExceptionDispatchInfo failure = null;

        using (var waitHandle = new CountdownEvent(1))
        {
            var bulkAll = esClient.BulkAll(typeBatch.Items, b => b
                .Index(index)
                .Type(typeBatch.Type.Name)
                .BackOffRetries(2)
                .BackOffTime("30s")
                .RefreshOnCompleted(true)
                .MaxDegreeOfParallelism(4)
                .Size(pageSize));

            bulkAll.Subscribe(new BulkAllObserver(
                onNext: b => Utils.Log(" Working - page {0} of {1}...", b.Page, totalPages),
                onError: e =>
                {
                    // Throwing here never reached the caller and left Wait() blocked forever.
                    failure = System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(e);
                    waitHandle.Signal();
                },
                onCompleted: () => waitHandle.Signal()));

            waitHandle.Wait();
        }

        if (failure != null)
        {
            failure.Throw();
        }

        Utils.Log("Index complete: {0} documents indexed", typeBatch.Items.Count);
    }
}
/// <summary>
/// Runs BulkAll through a client that points at two unreachable nodes and asserts that
/// no page is delivered and that OnError receives an
/// <c>ElasticsearchClientException</c> with the expected "halted" message.
/// </summary>
[I] public async Task BadBulkRequestFeedsToOnError()
{
    var indexName = CreateIndexName();
    var documents = await CreateIndexAndReturnDocuments(indexName);
    var pagesSeen = 0;

    var unreachableNodes = new[]
    {
        new Uri("http://test.example:9201"),
        new Uri("http://test.example:9202")
    };
    var unreachablePool = new StaticConnectionPool(unreachableNodes);
    var badClient = new ElasticClient(new ConnectionSettings(unreachablePool));

    var bulkAll = badClient.BulkAll(documents, descriptor => descriptor
        .MaxDegreeOfParallelism(8)
        .BackOffTime(TimeSpan.FromSeconds(10))
        .BackOffRetries(2)
        .Size(Size)
        .RefreshOnCompleted()
        .Index(indexName));

    Exception observed = null;
    var done = new ManualResetEvent(false);

    var subscription = bulkAll.Subscribe(
        page => Interlocked.Increment(ref pagesSeen),
        error =>
        {
            observed = error;
            done.Set();
        },
        () => done.Set());

    using (subscription)
    {
        done.WaitOne(TimeSpan.FromSeconds(60));

        pagesSeen.Should().Be(0);
        var clientException = observed.Should().NotBeNull().And.BeOfType<ElasticsearchClientException>().Subject;
        clientException.Message.Should().StartWith("BulkAll halted after attempted bulk failed over all the active nodes");
    }
}
private static async Task Main() { var aggs = new AggregationDictionary { { "startDates", new TermsAggregation("startDates") { Field = "startedOn" } }, { "endDates", new DateHistogramAggregation("endDates") { Field = "endedOn" } } }; var a = new SearchRequest() { From = 10, Size = 20, Query = new QueryContainer(new MatchAllQuery()), Aggregations = aggs, PostFilter = new QueryContainer(new TermQuery { Field = "state", Value = "Stable" }) }; var client = new ElasticClient(); client.Update <Person>("a", d => d.Index("test").Script(s => s.Source("script").Params(new Dictionary <string, object?> { { "null", new Person { FirstName = null, LastName = "test-surname" } } }))); var people = new List <Person>() { new Person { FirstName = "Steve", LastName = "Gordon" }, new Person { FirstName = "Steve", LastName = "Gordon" }, new Person { FirstName = "Steve", LastName = "Gordon" }, new Person { FirstName = "Steve", LastName = "Gordon" }, new Person { FirstName = "Steve", LastName = "Gordon" }, }; //using var bulk = client.BulkAll(people, r => r.Index("testing-v7")); //var result = bulk.Wait(TimeSpan.FromSeconds(60), a => { Console.WriteLine(a.Items.Count); }); //var a1 = result.TotalNumberOfRetries; //var b1 = result.TotalNumberOfFailedBuffers; using var bulk2 = client.BulkAll(people, r => r); var result2 = bulk2.Wait(TimeSpan.FromSeconds(60), a => { Console.WriteLine(a.Items.Count); }); var a12 = result2.TotalNumberOfRetries; var b12 = result2.TotalNumberOfFailedBuffers; //var responseBulk = client.Bulk(new BulkRequest //{ // Operations = new List<IBulkOperation> //{ // new BulkIndexOperation<Person>(new Person()) { Index = "people" } , // new BulkIndexOperation<Person>(new Person()) { Index = "people", IfSequenceNumber = -1, IfPrimaryTerm = 0 } //} //}); var response = client.Index(new Person(), e => e.Index("test")); var settingsResponse = await client.Indices.CreateAsync("a", i => i.Settings(s => s.Analysis(a => a.TokenFilters(tf => tf .Shingle("my-shingle", s => 
s.MinShingleSize(2)) .Snowball("my_snowball", s => s.Version("v1")))))); //var c1 = new ElasticClient(new ConnectionSettings(new Uri("https://azure.es.eastus.azure.elastic-cloud.com:9243")).BasicAuthentication("a", "b").ThrowExceptions()); //var r1 = await c1.PingAsync(); #pragma warning disable IDE0039 // Use local function Func <BoolQueryDescriptor <Person>, IBoolQuery> test = b => b.Name("thing");
/// <summary>
/// Indexes one sample document, then reads each novel text file, splits it into
/// 100-character chunks and bulk-indexes the chunks into the "novel" index in batches
/// of 2000, printing progress as a percentage.
/// </summary>
/// <param name="args">Unused.</param>
static void Main(string[] args)
{
    Console.WriteLine("Hello World!");

    // The SqlSugarClient that used to be created here was never used (every call on it
    // was commented out) and embedded a plaintext database password, so it was removed.

    var client = new ElasticClient(new ConnectionSettings(new Uri("http://192.168.13.128:9200")).DefaultIndex("novel"));
    client.IndexDocument(new Novel() { Content = "天之苍苍,其正色邪", Id = 5, KeyWord = "天之苍苍,其正色邪" });

    const int chunkLength = 100;
    const int batchSize = 2000;

    var id = 1294571;
    var tokenSource = new System.Threading.CancellationTokenSource();
    var titles = new List<string> { "从零开始", "仙逆", "宇宙巨校闪级生", "修神外传", "凡人修仙传", "将夜", "我欲封天", "武神空间", "永恒圣王", "求魔", "遮天" };

    foreach (var title in titles)
    {
        var text = File.ReadAllText($"{title}.txt");
        var batch = new List<Novel>(batchSize);

        // Walk the text by offset. The previous loop relied on Substring throwing at
        // the end of the text (swallowed by an empty catch), which silently dropped
        // the final chunk whenever it was shorter than 100 characters.
        for (var offset = 0; offset < text.Length; offset += chunkLength)
        {
            var length = Math.Min(chunkLength, text.Length - offset);
            batch.Add(new Novel() { Content = text.Substring(offset, length), Id = ++id });

            if (batch.Count >= batchSize)
            {
                var bulk = client.BulkAll(batch, f => f, tokenSource.Token);
                bulk.Wait(TimeSpan.FromSeconds(5), s => Console.WriteLine(s.Page));

                // Start a fresh list instead of clearing: if the 5 second wait timed
                // out, the bulk run may still be enumerating the old one.
                batch = new List<Novel>(batchSize);
            }

            // long arithmetic: the int product overflowed for large files.
            Console.WriteLine($"{(long)(offset + length) * 100 / text.Length}%");
        }

        if (batch.Count > 0)
        {
            client.BulkAll(batch, f => f).Wait(TimeSpan.FromSeconds(5), s => Console.WriteLine(s.Page));
        }
    }
}
// Builds ProductModel documents (including analyzer-derived completion suggestions)
// from `products` and bulk-indexes them into the "productsearch" index, waiting up to
// 30 minutes before returning the view.
//
// NOTE(review): this block's text is corrupted. A credential mask ("*****:*****@")
// appears to have swallowed everything between the connection URI literal and the
// end of the statement that defines `products`; the parentheses on that statement are
// unbalanced and `client` / `products` are not declared in the visible text. Restore
// the missing code from version control before relying on this method.
public IActionResult CreateIndex()
{
    var indexName = "productsearch";
    var _connectionSettings = new ConnectionSettings(new Uri("http://*****:*****@Id");
    var lists = products.Select(product =>
    {
        var model = new ProductModel();
        model.Id = product.Id;
        model.Name = product.Name;
        // Long descriptions are passed through _htmlEncoder; missing ones become "".
        model.FullDescription = string.IsNullOrEmpty(product.FullDescription) ? "" : _htmlEncoder.Encode(product.FullDescription);
        model.ShortDescription = product.ShortDescription;
        model.Tags = GetProductTagByProductId(product.Id);
        // Random placeholder value, not a real review count.
        model.ReviewCount = new Random().Next(1, 10000000);
        // One synchronous analyze request per product: tokenizes the product name with
        // the "ik_max_word" analyzer to produce the suggestion inputs.
        var tokens = client.Indices.Analyze(a => a .Analyzer("ik_max_word") .Text(product.Name) );
        var ts = tokens.Tokens.Select(w => w.Token);
        var list = new List <string>();
        if (ts.Any())
        {
            list.AddRange(ts.ToArray());
        }
        // Completion field: analyzer tokens as inputs, random weight.
        model.Suggest = new CompletionField { Input = list, Weight = new Random().Next(1, 10000000) };
        return(model);
    });
    // Earlier loop-based variant, kept disabled:
    //var lists = new List<ProductModel>();
    //foreach (var product in products)
    //{
    //    var model = new ProductModel();
    //    model.Id = product.Id;
    //    model.Name = product.Name;
    //    model.FullDescription = string.IsNullOrEmpty(product.FullDescription) ? "" : _htmlEncoder.Encode(product.FullDescription);
    //    model.ShortDescription = product.ShortDescription;
    //    model.Tags = product.ProductProductTagMappings.Any() ? product.ProductProductTagMappings.Select(s => s.ProductTag.Name).ToArray() : new List<string>().ToArray();
    //    model.ReviewCount = product.ProductReviews.Count();
    //    lists.Add(model);
    //}
    var waitHandle = new CountdownEvent(1);
    var bulkAll = client.BulkAll(lists, b => b .Index(indexName) .BackOffRetries(2) .BackOffTime("30s") .RefreshOnCompleted(true) .MaxDegreeOfParallelism(4) .Size(1000) );
    // NOTE(review): onError throws inside the observer and does not signal waitHandle,
    // so on failure the Wait below only returns when its 30 minute timeout elapses.
    bulkAll.Subscribe(new BulkAllObserver( onNext: b => Console.Write("."), onError: e => throw e, onCompleted: () => waitHandle.Signal() ));
    waitHandle.Wait(TimeSpan.FromMinutes(30));
    return(View());
}
/// <summary>
/// Development entry point. Currently runs a sample "水果" match query against the
/// product index, prints the hits and exits.
/// </summary>
/// <param name="args">Unused.</param>
/// <remarks>
/// Everything after the early <c>return</c> is deliberately disabled: it scrapes
/// product listings from JD search result pages, creates the "productsearch" index if
/// it is missing, and bulk-indexes the scraped products. Remove the <c>return</c> to
/// run that path.
/// </remarks>
static void Main(string[] args)
{
    var _client = GetClient();

    // To raise the maximum number of returned records (default 10000), update the
    // "index.max_result_window" index setting; see
    // https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html#dynamic-index-settings
    var searchResponse = _client.Search<ProductModel>(s => s
        .From(0)
        .Size(15000)
        .Query(q => q
            .Match(m => m
                .Field(f => f.Name)
                .Query("水果"))));

    var productList = searchResponse.Documents;
    foreach (var item in productList)
    {
        Console.WriteLine($"{item.Id}-----{item.Name}");
    }

    Console.WriteLine("完成!!!!");
    Console.ReadLine();
    return;

    // ---- Disabled path: scrape, create index, bulk index ----
    List<ProductModel> products = new List<ProductModel>();
    var keywords = new List<string>() { "java", "职业", "水果" };

    foreach (var keyword in keywords)
    {
        var pageIndex = 1;
        var totalPage = 1;
        var IsCalculatedPage = false;

        do
        {
            var url = $"https://search.jd.com/Search?keyword={keyword}&page={pageIndex}";
            var web = new HtmlWeb();
            web.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36";
            var doc = web.Load(url);
            var listNode = doc.GetElementbyId("J_goodsList");
            var totalPageNode = doc.DocumentNode.SelectSingleNode("//span[@class='fp-text']");

            // The page count is read once per keyword from text of the form "current/total".
            if (totalPageNode != null && !IsCalculatedPage)
            {
                var pageInfo = totalPageNode.InnerText;
                if (!string.IsNullOrEmpty(pageInfo))
                {
                    pageInfo = pageInfo.Replace("\t", "").Replace("\n", "");
                    // TryParse: scraped text that is not a number must not abort the run.
                    if (int.TryParse(pageInfo.Substring(pageInfo.IndexOf("/") + 1), out var parsedTotal))
                    {
                        totalPage = parsedTotal;
                        IsCalculatedPage = true;
                    }
                }
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            var list = listNode?.SelectNodes("./ul/li");
            if (list != null)
            {
                foreach (var li in list)
                {
                    ProductModel product = new ProductModel();
                    product.Id = products.Count + 1;

                    var imgNode = li.SelectSingleNode("./div[@class='gl-i-wrap']//div[@class='p-img']/a/img");
                    if (imgNode != null)
                    {
                        product.ImageUrl = imgNode.GetAttributeValue("data-lazy-img", "");
                    }

                    var priceNode = li.SelectSingleNode("./div[@class='gl-i-wrap']//div[@class='p-price']/strong/i");
                    if (priceNode != null)
                    {
                        // Invariant culture: the scraped price uses '.' as decimal
                        // separator regardless of the machine's locale.
                        if (decimal.TryParse(
                                priceNode.InnerText,
                                System.Globalization.NumberStyles.Number,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out var parsedPrice))
                        {
                            product.Price = parsedPrice;
                        }
                    }

                    var nameNode = li.SelectSingleNode("./div[@class='gl-i-wrap']//div[contains(@class,'p-name')]/a/em");
                    if (nameNode != null)
                    {
                        var name = nameNode.InnerText;
                        product.Name = name;
                        Console.WriteLine(name);
                    }

                    var shopNode = li.SelectSingleNode("./div[@class='gl-i-wrap']/div[@class='p-shopnum']/a");
                    if (shopNode != null)
                    {
                        product.Publisher = shopNode.InnerText;
                    }

                    products.Add(product);
                }
            }

            pageIndex++;
        } while (pageIndex <= totalPage);
    }

    var indexName = "productsearch";
    var _connectionSettings = new ConnectionSettings(new Uri("http://localhost:9200"))
        //.DefaultIndex(indexName)
        .DefaultMappingFor<ProductModel>(i => i.IndexName(indexName))
        //.EnableDebugMode()
        .PrettyJson()
        .RequestTimeout(TimeSpan.FromMinutes(2));
    var client = new ElasticClient(_connectionSettings);

    if (client.Indices.Exists(indexName).Exists)
    {
        //client.Indices.Delete(indexName);
    }
    else
    {
        // The index does not exist yet: create it.
        Dictionary<string, object> container = new Dictionary<string, object>();
        container.Add("index.max_result_window", 20000);

        IIndexState indexState = new IndexState
        {
            Settings = new IndexSettings(container)
            {
                NumberOfReplicas = 0, // replica count
                NumberOfShards = 2    // shard count
            }
        };

        var createIndexResonse = client.Indices.Create(indexName, i => i
            .InitializeUsing(indexState)
            .Map<ProductModel>(map => map
                .AutoMap()
                .Properties(props => props
                    .Number(t => t.Name(p => p.Id).Type(NumberType.Integer))
                    .Keyword(t => t.Name(p => p.ImageUrl))
                    .Text(t => t.Name(p => p.Name).Analyzer("ik_max_word"))
                    .Text(t => t.Name(p => p.Publisher).Analyzer("ik_max_word"))
                    .Number(t => t.Name(p => p.Price).Type(NumberType.Float))
                    .Number(t => t.Name(p => p.OldPrice).Type(NumberType.Float)))));

        if (!createIndexResonse.IsValid)
        {
            // Previously ignored; indexing into a missing or mis-mapped index is pointless.
            Console.WriteLine(createIndexResonse.DebugInformation);
            return;
        }
    }

    System.Runtime.ExceptionServices.ExceptionDispatchInfo failure = null;
    using (var waitHandle = new CountdownEvent(1))
    {
        var bulkAll = client.BulkAll(products, b => b
            .Index(indexName)
            .BackOffRetries(2)
            .BackOffTime("30s")
            .RefreshOnCompleted(true)
            .MaxDegreeOfParallelism(4)
            .Size(100));

        bulkAll.Subscribe(new BulkAllObserver(
            onNext: b => Console.Write("."),
            onError: e =>
            {
                // Throwing from the observer never reached this method and left the
                // handle unsignalled until the 30 minute timeout.
                failure = System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(e);
                waitHandle.Signal();
            },
            onCompleted: () => waitHandle.Signal()));

        waitHandle.Wait(TimeSpan.FromMinutes(30));
    }

    if (failure != null)
    {
        failure.Throw();
    }

    Console.WriteLine("");
    Console.WriteLine("----------------------------------------------");
    Console.WriteLine("任务完成......");
    Console.WriteLine("----------------------------------------------");
    Console.ReadLine();
}
//const int MAX_URI_LEN = 450;

/// <summary>
/// Entry point. With the first argument "indexer", loads crawled pages from the
/// database, extracts their text and bulk-indexes them into the "uri" Elasticsearch
/// index; otherwise starts the crawler at the configured start page.
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0] == "indexer"</c> selects indexing mode.</param>
static void Main(string[] args)
{
    ThreadPool.SetMinThreads(1000, 1000);

    IConfiguration Configuration = new ConfigurationBuilder()
        .AddJsonFile("appsettings.json", optional: true, reloadOnChange: true)
        .Build();

    if (args.Length > 0 && args[0] == "indexer")
    {
        var esConfig = Configuration.GetSection("ESConnection");
        var settings = new ConnectionSettings(new System.Uri(esConfig["Host"]))
            .DefaultIndex("uri")
            .BasicAuthentication(esConfig["Username"], esConfig["Password"]);
        var elasticClient = new ElasticClient(settings);

        var db = CrawlerContext.Create(Configuration.GetConnectionString("CrawlerDatabase"));

        Console.WriteLine($"Fetching db data...");
        var uriDocuments = db.Uri.Where(o => o.Content != null)
            .OrderBy(o => o.Id)
            .Select(o => new UriDocument()
            {
                AbsoluteUri = o.AbsoluteUri,
                // Prefer the browser-rendered content when it exists.
                BrowserHtml = o.BrowserContent ?? o.Content,
                Id = o.Id,
                OriginalUriString = o.OriginalString,
            })
            .ToList();

        Console.WriteLine($"Parsing html and generating doc text...");
        var htmlDoc = new HtmlDocument();
        foreach (var doc in uriDocuments)
        {
            htmlDoc.LoadHtml(doc.BrowserHtml);

            // Content without an <html> element yields null here; fall back to the
            // document root instead of failing with a NullReferenceException.
            var htmlBody = htmlDoc.DocumentNode.SelectSingleNode("//html") ?? htmlDoc.DocumentNode;
            doc.BrowserText = htmlBody.InnerText.Trim();
        }

        Console.WriteLine($"Saving to ES...");
        elasticClient.BulkAll(uriDocuments, b => b
                .Index("uri")
                // how long to wait between retries
                .BackOffTime("30s")
                // how many retries are attempted if a failure occurs
                .BackOffRetries(2)
                // refresh the index once the bulk operation completes
                .RefreshOnCompleted()
                // how many concurrent bulk requests to make
                .MaxDegreeOfParallelism(Environment.ProcessorCount)
                // number of items per bulk request
                .Size(1000)
                .DroppedDocumentCallback((item, uri) =>
                {
                    // if a document cannot be indexed this delegate is called
                    Console.WriteLine($"Unable to index: {item} {uri}");
                }))
            // Perform the indexing, waiting up to 15 minutes.
            // Whilst the BulkAll calls are asynchronous this is a blocking operation.
            .Wait(TimeSpan.FromMinutes(15), next =>
            {
                Console.WriteLine($"ES Bulked Items: {next.Items.Count}");

                // string.Join instead of an unseeded Aggregate, which throws when the
                // page contains no items.
                var summary = string.Join(
                    " ",
                    next.Items.GroupBy(o => o.Result).Select(o => o.Key + ":" + o.Count()));
                Console.WriteLine($"{summary}");
            });

        return;
    }

    const string startPage = "https://www.domain.com/";

    var crawler = new LightningCrawler(
        Configuration.GetConnectionString("CrawlerDatabase"),
        startPage,
        new string[]
        {
            "www.domain.com",
            "subdomain1.domain.com",
            "subdomain2.domain.com",
        },
        20,
        2,
        0);
    crawler.Run();
}
/// <summary>
/// Optionally creates the "blogs" index, then reads blogs from the database in pages of
/// <c>BATCH_SIZE</c> and bulk-indexes each page into Elasticsearch.
/// </summary>
/// <param name="endpoint">URI of the Elasticsearch endpoint.</param>
/// <param name="username">User name for basic authentication.</param>
/// <param name="password">Password for basic authentication.</param>
/// <param name="create">
/// When <c>true</c>, the "blogs" index is created (mappings, n-gram analyzer and
/// lowercase normalizer) before any document is sent; the method returns early if
/// that creation fails.
/// </param>
/// <remarks>
/// A failure reported by a bulk operation is rethrown on the calling thread, so the
/// paging loop stops instead of silently continuing with the next page.
/// </remarks>
public static void Run(string endpoint, string username, string password, bool create = false)
{
    var settings = new ConnectionSettings(new Uri(endpoint))
        .DefaultIndex("blogs")
        .BasicAuthentication(username, password);
    var client = new ElasticClient(settings);

    if (create)
    {
        var resp = client.Indices.Create("blogs", cid => cid
            .Map<BlogIndexed>(m => m.AutoMap()
                .Properties(p => p.Keyword(kp => kp.Name(b => b.Author).Normalizer("lowercase")))
                .Properties(p => p.Text(tp => tp.Name(b => b.Title).Fields(f => f.Text(tf => tf.Analyzer("ngram_lc").Name("ngram_lc")))))
                .Properties(p => p.Keyword(tp => tp.Name(b => b.Tags).Fields(f => f.Text(tf => tf.Analyzer("ngram_lc").Name("ngram_lc"))))))
            .Settings(i => i.Setting("max_ngram_diff", 30)
                .Setting("max_result_window", 100000)
                .Setting("max_rescore_window", 100000)
                .Analysis(a => a.Analyzers(ana => ana.Custom("ngram_lc", c => c.Filters("lowercase").Tokenizer("ngram_tokenizer")))
                    .Tokenizers(t => t.NGram("ngram_tokenizer", n => n.MaxGram(30).MinGram(1).TokenChars(TokenChar.Letter, TokenChar.Digit)))
                    .Normalizers(n => n.Custom("lowercase", cn => cn.Filters("lowercase"))))));
        if (!resp.IsValid)
        {
            Console.WriteLine("error creating index");
            return;
        }
    }

    BlogContextFactory blogContextFactory = new BlogContextFactory();
    using (var db = blogContextFactory.Create())
    {
        var totalBlogs = db.Blogs.Where(b => b.BlogID > 0).Count();
        Console.WriteLine($"total blogs: {totalBlogs}");
        int lastBlogId = LAST_BLOG_ID;

        for (int i = 0; i < totalBlogs; i += BATCH_SIZE)
        {
            // Load one page of blogs together with their post count and tags.
            var blogs = db.Blogs.Where(b => b.BlogID > LAST_BLOG_ID).OrderBy(b => b.BlogID).Skip(i).Take(BATCH_SIZE)
                .GroupJoin(db.Posts.Where(p => p.IdType == GmGard.Models.ItemType.Blog), b => b.BlogID, p => p.PostId, (b, p) => new { blog = b, post = p.Count() })
                .GroupJoin(db.TagsInBlogs.DefaultIfEmpty(), b => b.blog.BlogID, tib => tib.BlogID, (b, tib) => new
                {
                    b.blog,
                    tag = tib.Select(t => t.tag),
                    b.post,
                }).ToList();
            Console.WriteLine($"Send Items for {i} to {i + BATCH_SIZE - 1}");

            var bulk = client.BulkAll(blogs.Select(b => new BlogIndexed
            {
                Id = b.blog.BlogID,
                Title = b.blog.BlogTitle,
                Content = b.blog.Content,
                Tags = b.tag.Select(t => t.TagName),
                CreateDate = b.blog.BlogDate,
                CategoryId = b.blog.CategoryID,
                Author = b.blog.Author,
                IsHarmony = b.blog.isHarmony,
                IsApproved = b.blog.isApproved,
                BlogVisit = b.blog.BlogVisit,
                PostCount = b.post,
                Rating = b.blog.Rating ?? 0,
                ImagePath = b.blog.ImagePath,
                IsLocalImg = b.blog.IsLocalImg,
            }), s => s
                // in case of 429 response, how long we should wait before retrying
                .BackOffTime(TimeSpan.FromSeconds(5))
                // in case of 429 response, how many times to retry before failing
                .BackOffRetries(5)
                .Index<BlogIndexed>());

            // Throwing from inside the observer callback would not reach this thread,
            // so the error is captured here and rethrown after the wait below.
            ExceptionDispatchInfo bulkFailure = null;
            using (var waitHandle = new ManualResetEvent(false))
            {
                var bulkAllObserver = new BulkAllObserver(
                    onNext: bulkAllResponse =>
                    {
                        // do something after each bulk request
                        Console.WriteLine($"Done page {bulkAllResponse.Page} with retry {bulkAllResponse.Retries}");
                    },
                    onError: exception =>
                    {
                        bulkFailure = ExceptionDispatchInfo.Capture(exception);
                        waitHandle.Set();
                    },
                    onCompleted: () =>
                    {
                        // do something when all bulk operations complete
                        waitHandle.Set();
                    });
                bulk.Subscribe(bulkAllObserver);
                waitHandle.WaitOne();
            }

            // Surface the failure (with its original stack trace) before the page is
            // counted as indexed.
            bulkFailure?.Throw();

            if (blogs.Count > 0)
            {
                lastBlogId = blogs.Last().blog.BlogID;
            }
            if (blogs.Count < BATCH_SIZE)
            {
                break;
            }
        }

        client.Indices.Refresh(Indices.Index("blogs"));
        Console.WriteLine($"last blogs: {lastBlogId}");
        Console.ReadLine();
    }
}
/// <summary>
/// Bulk-indexes <paramref name="models"/> using the default BulkAll settings and blocks
/// until the operation has completed.
/// </summary>
/// <param name="client">Client used to send the bulk requests.</param>
/// <param name="models">Documents to index; a null or empty list is a no-op.</param>
/// <remarks>
/// An error reported by the bulk operation is rethrown on the calling thread.
/// </remarks>
private void bulkInser(ElasticClient client, List<AP_SP_SampleTypeExDatum> models)
{
    if (models == null || models.Count == 0)
    {
        return;
    }

    // BulkAll only builds the observable; the indexing itself starts on Subscribe.
    var bulkAll = client.BulkAll(models, default);

    ExceptionDispatchInfo bulkFailure = null;
    using (var finished = new ManualResetEvent(false))
    {
        var observer = new BulkAllObserver(
            onError: exception =>
            {
                // Keep the original stack trace so it can be rethrown after the wait.
                bulkFailure = ExceptionDispatchInfo.Capture(exception);
                finished.Set();
            },
            onCompleted: () => finished.Set());

        bulkAll.Subscribe(observer);
        finished.WaitOne();
    }

    bulkFailure?.Throw();
}
/// <summary>
/// Bulk sync to ES: documents flagged as deleted are removed, all others are indexed.
/// Each document goes to the index named after its lower-cased runtime type name.
/// </summary>
/// <typeparam name="T">Document</typeparam>
/// <param name="addedOrUpdatedDocuments">The documents to synchronize</param>
/// <returns>
/// The ids of the documents that were reported as dropped. If the sync does not finish
/// within one minute, only the drops recorded up to that point are returned and an
/// error is logged.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="addedOrUpdatedDocuments"/> is null.
/// </exception>
internal List<string> Execute<T>(List<T> addedOrUpdatedDocuments) where T : DocumentBase
{
    if (addedOrUpdatedDocuments == null)
    {
        throw new ArgumentNullException(nameof(addedOrUpdatedDocuments));
    }

    var sw = Stopwatch.StartNew();
    Logger.Debug($"Got {addedOrUpdatedDocuments.Count} new/updated items!");

    // Several bulk requests run concurrently (see MaxDegreeOfParallelism below), so the
    // dropped-document callback can fire on more than one thread: guard the list.
    var failedIds = new List<string>();
    var failedIdsGate = new object();

    var bulkAllObservable = _esClient.BulkAll(addedOrUpdatedDocuments, b => b
        .BufferToBulk((descriptor, buffer) =>
        {
            foreach (T document in buffer)
            {
                // Invariant lower-casing keeps the index name stable across cultures.
                string indexName = document.GetType().Name.ToLowerInvariant();

                if (document.Deleted)
                {
                    descriptor.Delete<T>(doc => doc.Index(indexName).Document(document));
                    Logger.Debug($"Item {document.Id} marked to be deleted!");
                }
                else
                {
                    descriptor.Index<T>(doc => doc.Index(indexName).Document(document));
                    Logger.Debug($"Item {document.Id} marked to be upserted!");
                }
            }
        })
        .DroppedDocumentCallback((bulkResponseItem, document) =>
        {
            Logger.Error($"Unable to index: {bulkResponseItem} {document}");
            lock (failedIdsGate)
            {
                failedIds.Add(document.Id);
            }
        })
        .BackOffTime("1s") // how long to wait between retries
        .BackOffRetries(_appSettings.Get(BusinessConstants.ElasticSearchBulkSyncNoOfRetries, DefaultValues.ElasticSearchBulkSyncNoOfRetries)) // how many retries are attempted if a failure occurs
        .RefreshOnCompleted() // refresh the index after bulk insert
        .MaxDegreeOfParallelism(Environment.ProcessorCount)
        .ContinueAfterDroppedDocuments(true)
        .Size(_appSettings.Get(BusinessConstants.ElasticSearchSyncBatchSize, DefaultValues.ElasticSearchSyncBatchSize)));

    // Not disposed on purpose: after a timeout the callbacks may still call Set().
    var waitHandle = new ManualResetEvent(false);
    ExceptionDispatchInfo exceptionDispatchInfo = null;
    var observer = new BulkAllObserver(
        onNext: response =>
        {
            Logger.Debug($"Written {response.Items.Count} in ES");
        },
        onError: exception =>
        {
            exceptionDispatchInfo = ExceptionDispatchInfo.Capture(exception);
            waitHandle.Set();
        },
        onCompleted: () => waitHandle.Set());

    // Subscribe to the observable, which will initiate the bulk indexing process
    bulkAllObservable.Subscribe(observer);

    // Block until completion or failure is signalled, but for at most one minute.
    TimeSpan maxWait = TimeSpan.FromMinutes(1);
    bool signalled = waitHandle.WaitOne(maxWait);
    if (!signalled)
    {
        Logger.Error($"Bulk sync did not complete within {maxWait.TotalSeconds} seconds; the returned failures may be incomplete.");
    }

    // If an exception was captured during the bulk indexing process, throw it
    exceptionDispatchInfo?.Throw();

    sw.Stop();
    Logger.Debug("Finished in {ElapsedMilliseconds} ms", sw.ElapsedMilliseconds);

    // Return a snapshot so late callbacks cannot mutate the list the caller holds.
    lock (failedIdsGate)
    {
        return new List<string>(failedIds);
    }
}