/// <summary>
/// Downloads and stores company data for every serial that has not been processed yet.
/// </summary>
/// <remarks>
/// Serials are requested in batches from <c>GetSerials</c> until a batch comes back empty.
/// Every serial that is processed without an exception is stored as a <c>Company</c> row,
/// including serials that have no CVR number (<c>CvrHelper.ToCvr</c> returned -1) and
/// serials whose document carries the "not found" message. A serial whose processing throws
/// is reported on the error stream and not stored.
/// The degree of parallelism defaults to 10 and can be overridden with the "parallelism"
/// app setting; values that are not positive integers are ignored.
/// </remarks>
public void Load()
{
    // Message in the fetched document that marks the CVR number as not being on the list.
    const string notFoundMarker = "Virksomhedsnavnet eller CVR/SE-nummeret findes ikke på skattelisterne for selskaber 2011";

    ServicePointManager.DefaultConnectionLimit = int.MaxValue;

    var parallelism = 10;
    var parallelismSetting = ConfigurationManager.AppSettings["parallelism"];
    int configuredParallelism;

    // int.TryParse writes 0 to its out argument on failure, so parse into a temporary
    // and only accept positive values; WithDegreeOfParallelism rejects anything below 1.
    // NOTE(review): PLINQ also enforces an upper bound on this value; an oversized
    // setting will still throw. Clamp here if that should be tolerated.
    if (!string.IsNullOrEmpty(parallelismSetting)
        && int.TryParse(parallelismSetting, out configuredParallelism)
        && configuredParallelism > 0)
    {
        parallelism = configuredParallelism;
    }

    Console.WriteLine("Parellelism: {0}", parallelism);

    IEnumerable<int> serials;
    do
    {
        serials = GetSerials();
        serials.AsParallel().WithDegreeOfParallelism(parallelism).ForAll(x =>
        {
            try
            {
                var company = new Company()
                {
                    Serial = x,
                };

                var cvrNumber = CvrHelper.ToCvr(x);
                if (cvrNumber != -1)
                {
                    company.Cvr = cvrNumber;

                    var contentHelper = new ContentHelper(Encoding.GetEncoding("ISO-8859-1"));
                    var document = contentHelper.GetContent(cvrNumber);

                    if (!document.DocumentNode.OuterHtml.Contains(notFoundMarker))
                    {
                        var extractor = new DataExtractor();
                        extractor.Extract(company, document);
                        Console.WriteLine("{0} Extracted {1} - {2}", company.Serial, company.Cvr, company.Name);
                    }
                    else
                    {
                        Console.WriteLine("{0} Disregarded {1}", company.Serial, company.Cvr);
                    }
                }

                // The row is stored in every non-failing case so the serial counts as processed.
                using (var context = new Context())
                {
                    context.Companies.Add(company);
                    context.SaveChanges();
                }
            }
            catch (Exception ex)
            {
                // Best effort: leave this serial for a later batch, but make the failure visible.
                // NOTE(review): a serial that fails every time is never stored, so the outer
                // loop cannot finish while such serials remain - confirm this is acceptable.
                Console.Error.WriteLine("{0} Failed: {1}", x, ex.Message);
            }
        });
    } while (serials.Any());
}
/// <summary>
/// Writes one CSV row per stored company to "out.csv", preceded by a header line.
/// </summary>
/// <remarks>
/// The companies are projected to <c>Record</c> objects and fully loaded before the
/// context is disposed; the file is written afterwards with <c>FileHelperEngine</c>.
/// </remarks>
public void Export()
{
    // Header columns, in the order they are joined into the header line.
    var headerLine = string.Join(",", new[]
    {
        "Name",
        "Cvr",
        "Type",
        "Legislation",
        "Profit",
        "Losses",
        "Tax",
        "FossilProfit",
        "FossilLosses",
        "FossilTax",
        "IsSubsidiary",
        "Subsidiaries"
    });

    IEnumerable<Record> rows;
    using (var db = new Context())
    {
        var projection =
            from company in db.Companies
            select new Record
            {
                Name = company.Name,
                Cvr = company.Cvr,
                Type = company.Type,
                Legislation = company.Legislation,
                Profit = company.Revenue,
                Losses = company.Losses,
                Tax = company.TaxPaid,
                FossilProfit = company.FossilProfit,
                FossilLosses = company.FossilLosses,
                FossilTax = company.FossilTaxPaid,
                IsSubsidiary = company.IsSubsidiary,
                Subsidiaries = company.Subsidiaries,
            };

        // Materialize while the context is still alive.
        rows = projection.ToList();
    }

    var csvWriter = new FileHelperEngine<Record>();
    csvWriter.HeaderText = headerLine;
    csvWriter.WriteFile("out.csv", rows);
}
/// <summary>
/// Prints the digit distribution of the non-zero tax, loss and revenue figures of all
/// stored companies, then waits for a key press.
/// </summary>
public void Analyze()
{
    using (var db = new Context())
    {
        // Each query keeps only rows where the figure is present and non-zero.
        Console.WriteLine("TAXES");
        var taxes =
            from company in db.Companies
            where company.TaxPaid.HasValue && company.TaxPaid != 0
            select company.TaxPaid.Value;
        PrintDigitDistribution(taxes);

        Console.WriteLine("LOSSES");
        var losses =
            from company in db.Companies
            where company.Losses.HasValue && company.Losses != 0
            select company.Losses.Value;
        PrintDigitDistribution(losses);

        Console.WriteLine("REVENUE");
        var revenues =
            from company in db.Companies
            where company.Revenue.HasValue && company.Revenue != 0
            select company.Revenue.Value;
        PrintDigitDistribution(revenues);
    }

    Console.WriteLine("Press the any key...");
    Console.ReadKey();
}
/// <summary>
/// Removes companies that share a <c>Cvr</c> value with another company, keeping the
/// first row of each group, and saves the deletions in one call.
/// </summary>
/// <remarks>
/// The query specifies no ordering, so which row counts as "first" is whatever order the
/// grouping yields.
/// NOTE(review): rows whose <c>Cvr</c> was never assigned presumably share a default value
/// and would be treated as duplicates of each other - confirm this is intended.
/// </remarks>
public void RemoveDuplicates()
{
    using (var db = new Context())
    {
        // Only groups with more than one member contain duplicates.
        var duplicateGroups = db.Companies
            .GroupBy(company => company.Cvr)
            .Where(sameCvr => sameCvr.Count() > 1);

        foreach (var sameCvr in duplicateGroups)
        {
            // Skip the first member so that one row per Cvr survives.
            foreach (var duplicate in sameCvr.Skip(1))
            {
                db.Companies.Remove(duplicate);
            }
        }

        db.SaveChanges();
    }
}
/// <summary>
/// Returns a shuffled batch of serials that are not yet present in the Companies table.
/// </summary>
/// <param name="firstSerial">Lowest serial to consider (inclusive).</param>
/// <param name="highestSerial">Highest serial to consider (inclusive).</param>
/// <param name="batchSize">Maximum number of serials to return.</param>
/// <returns>
/// At most <paramref name="batchSize"/> serials from the range that have no stored company;
/// empty when every serial in the range is already stored.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="highestSerial"/> is lower than <paramref name="firstSerial"/>, or the
/// range holds more than <see cref="int.MaxValue"/> values.
/// </exception>
private static IEnumerable<int> GetSerials(int firstSerial = 1000000, int highestSerial = 9999999, int batchSize = 10000)
{
    // The range is inclusive at both ends, hence the +1; computed as long so that an
    // extreme range cannot wrap around silently.
    var span = (long)highestSerial - firstSerial + 1;
    if (highestSerial < firstSerial || span > int.MaxValue)
    {
        throw new ArgumentOutOfRangeException("highestSerial", "highestSerial must not be lower than firstSerial and the range must fit in an Int32 count.");
    }

    var allSerials = Enumerable.Range(firstSerial, (int)span);

    Console.WriteLine("Finding serials that have not yet been downloaded...");
    var watch = Stopwatch.StartNew();

    // Load every stored serial up front so the context can be disposed before filtering.
    IEnumerable<int> knownSerials;
    using (var context = new Context())
    {
        knownSerials = context.Companies.Select(x => x.Serial).ToList();
    }

    var remainingSerials = allSerials.Except(knownSerials).Shuffle().Take(batchSize).ToList();
    watch.Stop();

    Console.WriteLine("Found {0} serials remaining in {1}ms", remainingSerials.Count, watch.ElapsedMilliseconds);
    return remainingSerials;
}