/// <summary>
/// Serializes <paramref name="database"/> to <paramref name="location"/> as indented JSON,
/// writing enum values as strings and omitting null-valued members.
/// </summary>
/// <param name="location">Destination file path; any existing file is overwritten.</param>
/// <param name="database">The database instance to persist.</param>
private static void SaveDatabase(string location, IpdbDatabase database)
{
    // NOTE(review): serializer errors are intentionally not handled here (the
    // original had a commented-out Error hook) — a failed save will throw.
    var jsonSerializer = new JsonSerializer
    {
        NullValueHandling = NullValueHandling.Ignore,
        Formatting = Formatting.Indented
    };
    jsonSerializer.Converters.Add(new Newtonsoft.Json.Converters.StringEnumConverter());

    using (var fileWriter = new StreamWriter(location, false))
    {
        using (JsonWriter jsonWriter = new JsonTextWriter(fileWriter))
        {
            jsonSerializer.Serialize(jsonWriter, database);
        }
    }
}
/// <summary>
/// Resumes scraping machine ids in [<paramref name="start"/>, <paramref name="end"/>),
/// appending results to an existing database. Gives up after 300 consecutive ids return
/// no result, and writes an incremental snapshot every 50 ids so an interrupted run can
/// be resumed later.
/// </summary>
/// <param name="database">Existing database to append to; must not be null.</param>
/// <param name="incrementalSaveLocation">Snapshot file path; null or empty disables incremental saves.</param>
/// <param name="start">First machine id to scrape (inclusive).</param>
/// <param name="end">Upper bound machine id (exclusive).</param>
/// <returns>The same <paramref name="database"/> instance with newly scraped entries added.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="database"/> is null.</exception>
public IpdbDatabase ScrapeAllResume(IpdbDatabase database, string incrementalSaveLocation, int start = 1, int end = 10000)
{
    if (database == null)
    {
        // Was `throw new Exception(...)`: ArgumentNullException is the specific,
        // conventional type here and is still caught by callers catching Exception.
        throw new ArgumentNullException(nameof(database), "You must resume from an existing database");
    }

    Log.Information("{Scraper}: Beginning Scrape All. Start: {start} End: {end}...", _scraperName, start, end);

    var model = database;
    const int maxThresholdOfNullsBeforeQuit = 300;
    int thresholdBeforeQuitCounter = 0;

    // Hoisted out of the loop: the serializer is pure configuration and reusable,
    // so there is no need to rebuild it on every incremental save.
    JsonSerializer serializer = new JsonSerializer();
    serializer.Converters.Add(new Newtonsoft.Json.Converters.StringEnumConverter());
    serializer.NullValueHandling = NullValueHandling.Ignore;
    serializer.Formatting = Formatting.Indented;
    //serializer.Error += Serializer_Error; //Ignore errors

    for (int i = start; i < end; i++)
    {
        var result = Scrape(i);
        if (result != null)
        {
            model.Data.Add(result);
            thresholdBeforeQuitCounter = 0; // Reset since we finally found a valid machine id
        }
        else
        {
            thresholdBeforeQuitCounter++;
        }

        if (thresholdBeforeQuitCounter > maxThresholdOfNullsBeforeQuit)
        {
            Log.Information("{Scraper}: Reached maximum threshold of invalid machine id's not returning results. Quiting...", _scraperName);
            break; // Reached too many consecutive invalid machines; assume end of catalog.
        }

        // Every 50 entries save where we are at so we can resume if errors occur.
        if (i % 50 == 0 && !string.IsNullOrEmpty(incrementalSaveLocation))
        {
            Log.Information("{Scraper}: Reached incremental save threshold. Saving where we are at so far.", _scraperName);
            using (StreamWriter sw = new StreamWriter(incrementalSaveLocation, false))
            using (JsonWriter writer = new JsonTextWriter(sw))
            {
                serializer.Serialize(writer, model);
            }
        }
    }

    Log.Information("{Scraper}: Finished Scrape All. Start: {start} End: {end}...", _scraperName, start, end);
    return model;
}
/// <summary>
/// Entry point. Configures Serilog and app settings, then either resumes a scrape from
/// the temp file (when "-resume" is passed) or performs a full scrape; on success saves
/// the final database and deletes the temp file.
/// </summary>
static void Main(string[] args)
{
    //var env = Environment.GetEnvironmentVariable("NETCORE_ENVIRONMENT");
    var builder = new ConfigurationBuilder()
        .AddJsonFile($"appsettings.json", true, true)
        //.AddJsonFile($"appsettings.{env}.json", true, true)
        .AddEnvironmentVariables();
    var config = builder.Build();

    Log.Logger = new LoggerConfiguration()
        .WriteTo.File("Log\\Log-.txt", rollingInterval: RollingInterval.Day)
        .WriteTo.Console()
        .CreateLogger();

    Log.Information("Starting IPDB Database Creator.");

    UserAgentStrings.InitializeList("Data\\UserAgentStrings.txt");
    Log.Information("User Agent strings initialized.");

    var database = new IpdbDatabase();
    var scraper = new IpdbScraper();
    scraper.EnableRandomSleepTime = false; //Try do it as fast as possible.
    var cfg = config.Get<AppSettings>();

    try
    {
        if (args.Any(p => p.Contains("-resume")))
        {
            if (File.Exists(cfg.TempFileLocation))
            {
                database = JsonConvert.DeserializeObject<IpdbDatabase>(File.ReadAllText(cfg.TempFileLocation));

                // Find where we left off: resume just past the highest id already scraped.
                var lastIpdb = database.Data.Max(c => c.IpdbId) + 1;
                database = scraper.ScrapeAllResume(database, cfg.TempFileLocation, lastIpdb, 10000);
            }
            else
            {
                Log.Information("Temp file not found: {file}", cfg.TempFileLocation);
                return;
            }

            SaveDatabase(cfg.FinalFileLocation, database);

            // The final database is saved; the resumable temp file is no longer needed.
            if (File.Exists(cfg.TempFileLocation))
            {
                File.Delete(cfg.TempFileLocation);
            }
        }
        else //Full scraping
        {
            database = scraper.ScrapeAll(cfg.TempFileLocation);
            SaveDatabase(cfg.FinalFileLocation, database);

            // The final database is saved; the resumable temp file is no longer needed.
            if (File.Exists(cfg.TempFileLocation))
            {
                File.Delete(cfg.TempFileLocation);
            }
        }

        // Logged inside try (before the finally flush) so the message is not lost;
        // as before, it is only emitted when scraping completed without error.
        Log.Information("Scraping Finished.");
    }
    catch (Exception ex)
    {
        // Serilog convention: pass the exception as the first argument so sinks
        // record the full exception and stack trace as structured data.
        Log.Error(ex, "Unhandled error during scraping");
        // `throw;` (not `throw ex;`) rethrows without resetting the stack trace (CA2200).
        throw;
    }
    finally
    {
        // Flush any buffered log events (the file sink buffers) before the process exits.
        Log.CloseAndFlush();
    }
}