Example #1
0
        /// <summary>
        /// Repeatedly fetches a batch of serials via <c>GetSerials</c>, processes each serial in parallel
        /// and stores one <c>Company</c> row per serial, until a fetched batch is empty.
        /// </summary>
        /// <remarks>
        /// The degree of parallelism defaults to 10 and can be overridden with the <c>parallelism</c>
        /// app setting. Values that are not positive integers are ignored, because
        /// <c>WithDegreeOfParallelism</c> rejects them with an <see cref="ArgumentOutOfRangeException"/>.
        /// Failures for a single serial are reported on standard error and otherwise ignored; that
        /// serial is not saved, so it presumably shows up again in a later batch.
        /// NOTE(review): a serial that fails every time keeps the outer loop from terminating.
        /// </remarks>
        public void Load()
        {
            ServicePointManager.DefaultConnectionLimit = int.MaxValue;

            var parallelism = 10;
            var parallelismSetting = ConfigurationManager.AppSettings["parallelism"];
            if (!string.IsNullOrEmpty(parallelismSetting))
            {
                // Parse into a temporary: int.TryParse writes 0 to its out argument on failure,
                // which would otherwise clobber the default with an invalid degree of parallelism.
                int configuredParallelism;
                if (int.TryParse(parallelismSetting, out configuredParallelism) && configuredParallelism > 0)
                {
                    parallelism = configuredParallelism;
                }
                else
                {
                    Console.Error.WriteLine("Ignoring invalid parallelism setting '{0}', using {1}", parallelismSetting, parallelism);
                }
            }

            Console.WriteLine("Parellelism: {0}", parallelism);

            IEnumerable<int> serials;
            do
            {
                serials = GetSerials();
                serials.AsParallel().WithDegreeOfParallelism(parallelism).ForAll(x =>
                {
                    try
                    {
                        var company = new Company()
                        {
                            Serial = x,
                        };

                        // -1 is treated as "no CVR number for this serial"; the company is still
                        // saved below with only its serial set.
                        var cvrNumber = CvrHelper.ToCvr(x);
                        if (cvrNumber != -1)
                        {
                            company.Cvr = cvrNumber;

                            var contentHelper = new ContentHelper(Encoding.GetEncoding("ISO-8859-1"));
                            var document = contentHelper.GetContent(cvrNumber);

                            // The Danish marker text says the company name or CVR/SE number is not
                            // on the 2011 corporate tax lists; such pages carry no data to extract.
                            if (!document.DocumentNode.OuterHtml.Contains("Virksomhedsnavnet eller CVR/SE-nummeret findes ikke på skattelisterne for selskaber 2011"))
                            {
                                var extractor = new DataExtractor();
                                extractor.Extract(company, document);
                                Console.WriteLine("{0} Extracted {1} - {2}", company.Serial, company.Cvr, company.Name);
                            }
                            else
                            {
                                Console.WriteLine("{0} Disregarded {1}", company.Serial, company.Cvr);
                            }
                        }

                        // One short-lived context per item, so no context is shared between workers.
                        using (var context = new Context())
                        {
                            context.Companies.Add(company);
                            context.SaveChanges();
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: leave this serial for later, but make the failure visible
                        // instead of swallowing it silently.
                        Console.Error.WriteLine("{0} Failed: {1}", x, ex.Message);
                    }
                });
            }
            while (serials.Any());
        }
Example #2
0
        /// <summary>
        /// Projects every stored company into a <c>Record</c> and writes the records to
        /// <c>out.csv</c> through a <c>FileHelperEngine</c>, using the column names as header text.
        /// </summary>
        /// <remarks>
        /// The <c>Profit</c> column is filled from the company's <c>Revenue</c> value.
        /// </remarks>
        public void Export()
        {
            IEnumerable<Record> rows;
            using (var db = new Context())
            {
                // Materialise inside the using block so the rows outlive the context.
                var projection = from company in db.Companies
                                 select new Record
                                 {
                                     Name = company.Name,
                                     Cvr = company.Cvr,
                                     Type = company.Type,
                                     Legislation = company.Legislation,
                                     Profit = company.Revenue,
                                     Losses = company.Losses,
                                     Tax = company.TaxPaid,
                                     FossilProfit = company.FossilProfit,
                                     FossilLosses = company.FossilLosses,
                                     FossilTax = company.FossilTaxPaid,
                                     IsSubsidiary = company.IsSubsidiary,
                                     Subsidiaries = company.Subsidiaries,
                                 };
                rows = projection.ToList();
            }

            string[] headerNames =
            {
                "Name", "Cvr", "Type", "Legislation",
                "Profit", "Losses", "Tax",
                "FossilProfit", "FossilLosses", "FossilTax",
                "IsSubsidiary", "Subsidiaries",
            };

            var csvEngine = new FileHelperEngine<Record>();
            csvEngine.HeaderText = string.Join(",", headerNames);
            csvEngine.WriteFile("out.csv", rows);
        }
Example #3
0
        /// <summary>
        /// Prints the digit distribution of the non-null, non-zero tax, loss and revenue values
        /// of all stored companies, then waits for a key press.
        /// </summary>
        public void Analyze()
        {
            using (var db = new Context())
            {
                Console.WriteLine("TAXES");
                var taxes = db.Companies
                    .Where(x => x.TaxPaid.HasValue && x.TaxPaid != 0)
                    .Select(x => x.TaxPaid.Value);
                PrintDigitDistribution(taxes);

                Console.WriteLine("LOSSES");
                var losses = db.Companies
                    .Where(x => x.Losses.HasValue && x.Losses != 0)
                    .Select(x => x.Losses.Value);
                PrintDigitDistribution(losses);

                Console.WriteLine("REVENUE");
                var revenues = db.Companies
                    .Where(x => x.Revenue.HasValue && x.Revenue != 0)
                    .Select(x => x.Revenue.Value);
                PrintDigitDistribution(revenues);
            }

            // Keep the console window open until the user has read the output.
            Console.WriteLine("Press the any key...");
            Console.ReadKey();
        }
Example #4
0
        /// <summary>
        /// Deletes surplus company rows so that each <c>Cvr</c> value that occurred more than once
        /// keeps only the first row enumerated for it.
        /// </summary>
        /// <remarks>
        /// All removals are committed with a single <c>SaveChanges</c> call.
        /// NOTE(review): rows that share a default or empty <c>Cvr</c> fall into one group as well,
        /// so all but one of them are removed; confirm that this is intended.
        /// </remarks>
        public void RemoveDuplicates()
        {
            using (var db = new Context())
            {
                var duplicatedCvrGroups = db.Companies
                    .GroupBy(x => x.Cvr)
                    .Where(g => g.Count() > 1);

                foreach (var cvrGroup in duplicatedCvrGroups)
                {
                    // Keep the first member of each group and remove the rest.
                    var surplus = cvrGroup.Skip(1);
                    foreach (var duplicate in surplus)
                    {
                        db.Companies.Remove(duplicate);
                    }
                }

                db.SaveChanges();
            }
        }
Example #5
0
        /// <summary>
        /// Returns a random batch of serials in the inclusive range
        /// [<paramref name="firstSerial"/>, <paramref name="highestSerial"/>] that are not yet stored
        /// as a company.
        /// </summary>
        /// <param name="firstSerial">Lowest serial to consider.</param>
        /// <param name="highestSerial">Highest serial to consider (inclusive).</param>
        /// <param name="batchSize">Maximum number of serials to return.</param>
        /// <returns>
        /// At most <paramref name="batchSize"/> serials that are not yet stored, in shuffled order;
        /// empty when every serial in the range is already stored.
        /// </returns>
        private static IEnumerable<int> GetSerials(int firstSerial = 1000000, int highestSerial = 9999999, int batchSize = 10000)
        {
            // Enumerable.Range takes a count, not an end value, so add one to make the range
            // include highestSerial itself.
            var count = highestSerial - firstSerial + 1;
            var allSerials = Enumerable.Range(firstSerial, count);

            Console.WriteLine("Finding serials that have not yet been downloaded...");
            var watch = Stopwatch.StartNew();

            // Materialise inside the using block so the list outlives the context.
            List<int> knownSerials;
            using (var context = new Context())
            {
                knownSerials = context.Companies.Select(x => x.Serial).ToList();
            }
            var remainingSerials = allSerials.Except(knownSerials).Shuffle().Take(batchSize).ToList();

            watch.Stop();
            // The reported number is the size of this batch (capped at batchSize).
            Console.WriteLine("Found {0} serials remaining in {1}ms", remainingSerials.Count, watch.ElapsedMilliseconds);

            return remainingSerials;
        }