/// <summary>
/// Builds preview records for the substances selected by <paramref name="query"/>.
/// At most 100 substances are read from the filtered substance set.
/// </summary>
/// <param name="query">Search criteria; handed to <c>BuildQuery</c> together with the full substance set.</param>
/// <returns>One <c>SubstancePreview</c> per loaded substance (never more than 100).</returns>
public List<SubstancePreview> GetByQuery(QueryModel query)
{
    // NOTE(review): this result is never read in this method. It is kept because loading the
    // elements may have a side effect in the persistence layer that is not visible here
    // - confirm, and delete the call if it has none.
    var elements = _elements.GetAll().ToList();

    var name = "";
    var qToDb = BuildQuery(_substances.GetAll(), query, out name);

    // Guard against BuildQuery handing back null: string.Contains(null) throws.
    // NOTE(review): names are lower-cased before the containment test but the search text is not,
    // so BuildQuery presumably returns it already lower-cased - confirm.
    var searchText = name ?? "";

    // Materialise first so the projection below runs in memory.
    var list = qToDb.Take(100).ToList();

    var result = list.Select(x =>
    {
        // Null values are skipped so that ToLower() cannot throw on them.
        var nameValues = x.Names
            .Select(n => n.Value)
            .Where(v => v != null)
            .ToList();

        // Prefer the first name containing the search text. When nothing matches, fall back to
        // the first available name instead of dereferencing a null match (the previous code
        // threw NullReferenceException here). Name stays null if the substance has no names.
        var displayName = nameValues.FirstOrDefault(v => v.ToLower().Contains(searchText))
                          ?? nameValues.FirstOrDefault();

        return new SubstancePreview
        {
            Name = displayName,
            Formula = x.Formula,
            Categories = GetCategoryList(x.Categories)
                .OrderBy(c => c.Id)
                .Select(c => c.Name)
                .Distinct()
                .ToArray(),
            // Only the first 12 names are exposed as synonyms.
            Synonyms = x.Names.Select(n => n.Value).Take(12).ToArray(),
            Scheme = x.Scheme.Select(s => s.Value.HtmlDecode()).FirstOrDefault(),
            Id = x.Id
        };
    }).ToList();

    return result;
}
/// <summary>
/// For every stored substance that has a CAS value, crawls
/// <c>http://www.chemindex.com/{CAS}-cas.html</c>, copies the crawled physical properties
/// (boiling point, density, flash point, hazard symbols, melting point, refractive index,
/// vapour pressure, water solubility) onto the substance and finally saves all updated
/// substances through <c>_substances.UpdateAll</c>. Progress and timings are written to the console.
/// </summary>
/// <returns>
/// Always <c>null</c>. The crawl error text is collected in a local variable but is not returned.
/// </returns>
public override string Crawl()
{
    Console.WriteLine("Start");
    string ans = "";
    var ward = new CriticalExCounter();
    var t = new Stopwatch();
    t.Start();

    var addr = "http://www.chemindex.com/";
    var initialRange = _substances.GetAll().ToDictionary(x => x.Id);
    var resultRange = new Dictionary<int, Substance>();
    Console.WriteLine(String.Format("\nTime elapsed to dic: {0}\n", t.Elapsed));

    int count = 0;
    int all = initialRange.Count;
    foreach (var substance in initialRange)
    {
        ++count;
        if (count % 25 == 0)
        {
            Console.WriteLine(String.Format("\nElements: {0} of {1}. Time elapsed: {2}\n", count, all, t.Elapsed));
        }

        // Without a CAS value there is no page address to build.
        if (!substance.Value.CAS.HasValue())
        {
            continue;
        }

        // Pause only in front of an actual request; skipped substances no longer cost a delay.
        // NOTE(review): GetRnd() presumably yields a randomised delay in milliseconds - confirm.
        System.Threading.Thread.Sleep(GetRnd());

        var crawler = GetCrawler();
        var uri = new Uri(addr + substance.Value.CAS + "-cas.html");

        // The token source is disposable; release it as soon as this crawl is finished.
        using (var cToken = new CancellationTokenSource())
        {
            try
            {
                crawler.CrawlBag.elements = new ConcurrentBag<SpiderSubstanceCI>();
                var result = crawler.Crawl(uri, cToken);

                // First() throws when nothing was collected; that is handled as a failed crawl below.
                var element = (crawler.CrawlBag.elements as ConcurrentBag<SpiderSubstanceCI>).First();

                if (result.ErrorOccurred)
                {
                    ans += String.Format("Crawl of {0} completed with error: {1}\n", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                }

                substance.Value.BoilingPoint = element.BoilingPoint;
                substance.Value.Density = element.Density;
                substance.Value.FlashPoint = element.FlashPoint;
                substance.Value.HazardSymbols = element.HazardSymbols;
                substance.Value.MeltingPoint = element.MeltingPoint;
                substance.Value.RefractiveIndex = element.RefractiveIndex;
                substance.Value.VapourPressur = element.VapourPressur;
                substance.Value.WaterSolubility = element.WaterSolubility;

                resultRange.Add(substance.Key, substance.Value);
                ward.Tick();
            }
            catch (Exception ex)
            {
                // Best effort: one failing page must not abort the whole run, but the reason
                // is reported instead of being dropped silently.
                Console.WriteLine(String.Format("Crawl of {0} failed: {1}", uri.AbsoluteUri, ex.Message));

                ward.Bad();
                if (ward.IsCritical())
                {
                    // Save what has been gathered so far once the failure counter reports critical.
                    // NOTE(review): the loop keeps going after this flush - confirm that no
                    // break/return is intended here.
                    Console.WriteLine(String.Format("Time elapsed scan: {0}", t.Elapsed));
                    _substances.UpdateAll(resultRange);
                    Console.WriteLine(String.Format("Time elapsed update: {0}", t.Elapsed));
                }
            }
        }
    }

    Console.WriteLine(String.Format("Time elapsed scan: {0}", t.Elapsed));
    _substances.UpdateAll(resultRange);
    Console.WriteLine(String.Format("Time elapsed update: {0}", t.Elapsed));

    // NOTE(review): the collected error text (ans) is deliberately left unreturned, as before;
    // callers receive null.
    return null;
}
/// <summary>
/// Crawls the easychem.org substance reference for category numbers 1 through 128 (first page
/// address of each), converts the collected <c>SpiderSubstance</c> items into <c>Substance</c>
/// entities and stores those that are not already present in the substance set.
/// Timings are written to the console.
/// </summary>
/// <returns>
/// One line per crawl that reported an error; an empty string when no crawl reported one.
/// </returns>
public override string Crawl()
{
    var timer = new Stopwatch();
    timer.Start();

    string ans = "";
    var substSet = new SortedSet<SpiderSubstance>();
    var addr = "http://easychem.org/ru/subst-ref/?cat0=";

    for (int i = 1; i < 129; ++i)
    {
        var crawler = GetCrawler();
        var uri = new Uri(addr + i + "&pg=1");

        // The token source is disposable; release it as soon as this crawl is finished.
        using (var cToken = new CancellationTokenSource())
        {
            crawler.CrawlBag.elements = new ConcurrentBag<SpiderSubstance>();
            var result = crawler.Crawl(uri, cToken);

            var elements = (crawler.CrawlBag.elements as ConcurrentBag<SpiderSubstance>);
            foreach (var item in elements)
            {
                // The sorted set drops items that compare equal to one already collected.
                substSet.Add(item);
            }

            if (result.ErrorOccurred)
            {
                ans += String.Format("Crawl of {0} completed with error: {1}\n", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
        }
    }
    Console.WriteLine(String.Format("Time elapsed : {0}, PARSED", timer.Elapsed.TotalMinutes));

    var contextSet = new SortedSet<Substance>(_substances.GetAll());
    Console.WriteLine(String.Format("Time elapsed : {0}, SET_CREATED", timer.Elapsed.TotalMinutes));

    // Resolve every distinct category name once, using the same lookup as before, instead of
    // repeating it for each (substance, category) pair.
    var categoryByName = substSet
        .SelectMany(x => x.Categories)
        .Where(c => c != null)
        .Distinct()
        .ToDictionary(c => c, c => _categories.GetAll().FirstOrDefault(z => z.Name == c));

    // Still lazy: the entities are only created while the loop below enumerates the sequence.
    var trueSubstList = substSet.Select(x => new Substance
    {
        CAS = x.CAS,
        Formula = x.BruttoFormula,
        Names = x.Names.Select(n => new SubstanceName(n)).ToList(),
        Scheme = x.Formulas.Select(f => new SubstanceScheme(f)).ToList(),
        // Category names without a stored category are dropped.
        Categories = x.Categories
            .Where(c => c != null)
            .Select(c => categoryByName[c])
            .Where(v => v != null)
            .ToList()
    });
    Console.WriteLine(String.Format("Time elapsed : {0}, TRUE_LIST_Q", timer.Elapsed.TotalMinutes));

    var listToAdd = new List<Substance>();
    foreach (var item in trueSubstList)
    {
        // Add() returns false when an equal substance is already in the set, so only new ones are queued.
        if (contextSet.Add(item))
        {
            listToAdd.Add(item);
        }
    }
    Console.WriteLine(String.Format("Time elapsed : {0}, ALL_ADD", timer.Elapsed.TotalMinutes));

    _substances.AddMany(listToAdd);
    Console.WriteLine(String.Format("Time elapsed : {0}, elements found: {1}", timer.Elapsed.TotalMinutes, listToAdd.Count));

    return ans;
}