/// <summary>
/// Downloads the HTML of <paramref name="website"/> into <c>website.Html</c> and logs the result.
/// On an HTTP protocol error the response code is recorded (404 for NotFound, 0 otherwise) and
/// the status description is stored as the HTML. Any other failure marks the website as not
/// crawled and rethrows.
/// </summary>
/// <param name="website">Target website; its Url is read and Html/ResponseCode are written.</param>
public void Download(CrawlerWebsite website)
{
    try
    {
        try
        {
            using var webClient = new WebClient();
            website.Html = webClient.DownloadString(website.Url);
        }
        catch (WebException ex)
        {
            // ex.Response is null for non-protocol errors (DNS failure, timeout);
            // guard before casting to avoid a NullReferenceException inside the handler.
            if (ex.Response is HttpWebResponse response)
            {
                // BUG FIX: the original fell through after the 404 check and
                // unconditionally overwrote ResponseCode with 0.
                website.ResponseCode =
                    response.StatusCode == HttpStatusCode.NotFound ? 404 : 0;
                website.Html = response.StatusDescription;
            }
            else
            {
                website.ResponseCode = 0;
                website.Html = ex.Message;
            }
        }

        _logger.LogInformation($"Downloaded: {website.Url} with code {website.ResponseCode}");
    }
    catch (Exception e)
    {
        _logger.LogError(e.Message);
        website.Crawled = false;
        website.CrawledTimestamp = null;
        throw; // preserve the original stack trace for the caller
    }
}
/// <summary>
/// Proposes Czech noun words for Wikipedia meaning-check crawling. Repeatedly takes a random
/// batch of not-yet-proposed noun specifications, marks them proposed, queues a Wikipedia URL
/// per word, and persists the changes, until a batch comes back empty.
/// </summary>
/// <param name="options">Batch/flush/sleep tuning; defaults are used when null.</param>
public async Task Propose(ProposerThreadOptions? options)
{
    options ??= new ProposerThreadOptions();
    options.Validate();

    List<CsWordNounSpecification> wordsNounSpec;
    do
    {
        wordsNounSpec = _context.CsWordNounSpecifications
            .Include(e => e.Word)
            .Where(specification =>
                specification.CrawlerProposedWiki != true
                && specification.DeclensionSg1 == true
                && specification.Word.CrawlerMeaningCheckProposed != true)
            .OrderBy(r => Guid.NewGuid()) // Random order (warning: it is very slow operation!)
            .Take(options.StepCount)
            .ToList();

        int counter = 0;
        foreach (var wordNounSpec in wordsNounSpec)
        {
            counter++;
            wordNounSpec.CrawlerProposedWiki = true;
            wordNounSpec.Word.CrawlerMeaningCheckProposed = true;
            // NOTE(review): local time is stored; confirm whether UtcNow is intended
            // before changing — existing rows already use this convention.
            wordNounSpec.Word.CrawlerMeaningCheckProposedTime = DateTime.Now;

            var crawlerWebsite = new CrawlerWebsite
            {
                Url = $"https://cs.wikipedia.org/wiki/{wordNounSpec.Word.Text.FirstCharToUpper()}",
                WordId = wordNounSpec.WordId
            };

            await _entityManager.Persist(wordNounSpec);
            await _entityManager.Persist(crawlerWebsite);

            // Flush periodically so a large batch does not grow one huge unit of work.
            if (counter % options.FlushAfterCount == 0)
            {
                await _entityManager.Flush();
            }
        }

        await _entityManager.Flush();

        // BUG FIX: Thread.Sleep blocks the calling thread inside an async method;
        // Task.Delay yields the thread back to the pool for the same wall-clock pause.
        await Task.Delay(options.SleepTime);
    } while (wordsNounSpec.Count > 0);
}
/// <summary>
/// Posts crawl results for <paramref name="webpage"/>. Not yet implemented.
/// </summary>
/// <param name="webpage">The crawled website whose results would be posted.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
private static Task Post([NotNull] CrawlerWebsite webpage) =>
    throw new NotImplementedException();
/// <summary>
/// Intended to scrape data out of the downloaded <paramref name="website"/>.
/// NOTE(review): currently a no-op stub — the body is empty and nothing is read or
/// written; confirm whether this is a placeholder awaiting implementation.
/// </summary>
/// <param name="website">The website to scrape (currently unused).</param>
public void Scrape(CrawlerWebsite website) { }