Example #1
0
        /// <summary>
        /// Resolves every distinct domain against the DNS server at 8.8.8.8 in parallel and
        /// splits the input into "bad" and "clean" domains. Four files are written, all derived
        /// from <c>rayCommandOptions.OutputFile</c>: the bad domains, the clean domains, an
        /// error log and a list of the MX records that were found.
        /// </summary>
        /// <param name="domains">Domain names to check. Entries for which <c>HasValue()</c> is false are reported as bad.</param>
        /// <param name="rayCommandOptions">Options supplying the base output file path.</param>
        private static void CheckDns(IEnumerable<string> domains, RayCommandOptions rayCommandOptions)
        {
            var st = Stopwatch.StartNew();
            var version = Guid.NewGuid().ToString().Substring(0, 6);

            // The input is walked twice (once to resolve, once to build the clean list),
            // so materialize it up front instead of re-enumerating the caller's sequence.
            var allDomains = domains.ToList();

            var error = new List<string>();
            var mx = new List<string>();

            // List<T> is not safe for concurrent writers and the predicate below runs on
            // several PLINQ worker threads, so every Add goes through this lock.
            var gate = new object();

            var badDomain = allDomains
                .Distinct()
                .AsParallel()
                .Where(domain =>
                    {
                        // The predicate returns true for domains that must be reported as bad.
                        if (!domain.HasValue())
                            return true;

                        var client = new DnsClient(IPAddress.Parse("8.8.8.8"), 10000);

                        var mxRecords = client.Resolve(domain, RecordType.Mx);
                        if (mxRecords != null)
                        {
                            var mxAnswers = mxRecords.AnswerRecords.OfType<MxRecord>().ToList();

                            if (mxRecords.ReturnCode == ReturnCode.NoError || mxAnswers.Count > 0)
                            {
                                if (mxAnswers.Count > 0)
                                {
                                    var line = "The domain: " + domain + " has mx records: " + mxAnswers.Select(x => x.ExchangeDomainName).Commafy();
                                    lock (gate)
                                    {
                                        mx.Add(line);
                                    }
                                }

                                return false;
                            }
                        }

                        // The MX lookup was inconclusive: fall back to an A lookup, which is
                        // attempted at most twice when the server answers with ServerFailure.
                        var retryCount = 0;

                        while (retryCount < 2)
                        {
                            var aRecord = client.Resolve(domain, RecordType.A);

                            if (aRecord == null)
                            {
                                lock (gate)
                                {
                                    error.Add("this domain produce null: " + domain);
                                }
                                return true;
                            }

                            if (aRecord.ReturnCode == ReturnCode.ServerFailure)
                            {
                                retryCount++;
                                continue;
                            }

                            if (aRecord.ReturnCode != ReturnCode.NoError)
                            {
                                lock (gate)
                                {
                                    error.Add(aRecord.ReturnCode + " dns error for: " + domain);
                                }
                                return true;
                            }

                            // NoError: the A lookup succeeded, so the domain is kept.
                            // A connectivity probe that used to follow this point could never
                            // execute (both outcomes of the NoError test returned first) and
                            // has been dropped; the result is unchanged.
                            return false;
                        }

                        // Both A lookups ended in ServerFailure.
                        return true;

                    }).ToList();

            st.Stop();

            WriteToConsole("Total shabank took: " + st.ElapsedMilliseconds);

            File.WriteAllLines(rayCommandOptions.OutputFile, badDomain.OrderBy(x => x));
            File.WriteAllLines(rayCommandOptions.OutputFile + "." + version + ".clean.txt", allDomains.Except(badDomain).OrderBy(x => x));
            File.WriteAllLines(rayCommandOptions.OutputFile + ".error.log." + version + ".txt", error.OrderBy(x => x).ToList());
            File.WriteAllLines(rayCommandOptions.OutputFile + ".mx.txt", mx);
        }
Example #2
0
 /// <summary>
 /// Serializes <paramref name="newRows"/> as CSV into the file named by
 /// <c>rayCommandOptions.OutputFile</c>.
 /// </summary>
 /// <param name="rayCommandOptions">Options supplying the output file path.</param>
 /// <param name="newRows">Rows to write.</param>
 private static void WriteCsv(RayCommandOptions rayCommandOptions, IEnumerable<OneRawContactsListCsvRow> newRows)
 {
     using (var textWriter = new StreamWriter(rayCommandOptions.OutputFile))
     // Dispose the CSV writer as well so anything it still holds is flushed
     // before the underlying stream is closed.
     using (var csvWriter = new CsvWriter(textWriter))
     {
         csvWriter.WriteRecords(newRows);
     }
 }
Example #3
0
        /// <summary>
        /// Counts the rows per lower-cased domain (as produced by <c>GetDomain</c>), reports the
        /// number of distinct domains on the console and writes one "Domain: X has: N" line per
        /// domain, most frequent first, to <c>rayCommandOptions.OutputFile</c>.
        /// </summary>
        /// <param name="rows">Contact rows to aggregate.</param>
        /// <param name="rayCommandOptions">Options supplying the output file path.</param>
        private static void TopDomains(IEnumerable<OneRawContactsListCsvRow> rows, RayCommandOptions rayCommandOptions)
        {
            var groupedByDomain = rows
                .Select(GetDomain)
                .GroupBy(name => name.ToLower());

            var ranked = groupedByDomain
                .Select(bucket => new { bucket.Key, Count = bucket.Count() })
                .OrderByDescending(entry => entry.Count)
                .ToList();

            WriteToConsole("There are {0} groups", ranked.Count);
            WriteToConsole("The top 10 domains are:");
            WriteSaperator();

            // Every group is written to the file, not only the first ten.
            var reportLines = new List<string>(ranked.Count);
            foreach (var entry in ranked)
            {
                reportLines.Add(string.Format("Domain: {0} has: {1}", entry.Key, entry.Count));
            }

            File.WriteAllLines(rayCommandOptions.OutputFile, reportLines);

            WriteSaperator();
        }
Example #4
0
        /// <summary>
        /// Drops every row whose domain has more than
        /// <c>rayCommandOptions.MaximalCountOfContacts</c> contacts and writes the remaining rows
        /// as CSV, logging how long each of the three phases took.
        /// </summary>
        /// <param name="rows">All contact rows.</param>
        /// <param name="rayCommandOptions">Options supplying the contact limit and the output file.</param>
        private static void OutputSmallDomains(List<OneRawContactsListCsvRow> rows, RayCommandOptions rayCommandOptions)
        {
            // Phase 1: find the domains that exceed the configured contact limit.
            var timer = Stopwatch.StartNew();
            var oversizedDomains =
                (from domainGroup in GroupByDomain(rows)
                 where domainGroup.Count() > rayCommandOptions.MaximalCountOfContacts
                 select domainGroup.Key into domainName
                 where domainName.HasValue()
                 select domainName).ToList();
            timer.Stop();

            WriteToConsole("Group by took {0} seconds", timer.ElapsedMilliseconds / 1000L);

            WriteToConsole("There are {0} domains to remove, they are {1}", oversizedDomains.Count, oversizedDomains.Commafy());

            // Phase 2: filter those domains out of the row set.
            timer.Restart();
            var keptRows = RemoveRowsByDomains(rows, oversizedDomains);
            timer.Stop();

            WriteToConsole("Removing domains took {0} ms", timer.ElapsedMilliseconds);

            // Phase 3: persist what is left.
            timer.Restart();
            WriteCsv(rayCommandOptions, keptRows);
            timer.Stop();

            WriteToConsole("Writing the CSV took {0} ms", timer.ElapsedMilliseconds);
        }
Example #5
0
        /// <summary>
        /// Entry point. Parses the command line into a <c>RayCommandOptions</c> instance and runs
        /// every operation the options enable: the CSV based operations when a CSV file is given,
        /// and the DNS check when a domain list is given.
        /// </summary>
        /// <param name="args">Raw command-line arguments.</param>
        static void Main(string[] args)
        {
            var rayCommandOptions = new RayCommandOptions();
            if (!CommandLineParser.Default.ParseArguments(args, rayCommandOptions))
            {
                WriteToConsole("Parameter problem");
                return;
            }

            if (!string.IsNullOrEmpty(rayCommandOptions.CsvFile))
            {
                var st = new Stopwatch();
                st.Start();

                // Read everything into memory and release the input file straight away;
                // several of the operations below write output files while the rows are in use.
                List<OneRawContactsListCsvRow> rows;
                using (var csvSource = File.OpenRead(rayCommandOptions.CsvFile))
                using (var streamReader = new StreamReader(csvSource))
                using (var csvReader = new CsvReader(streamReader))
                {
                    rows = csvReader.GetRecords<OneRawContactsListCsvRow>().ToList();
                }
                st.Stop();

                WriteToConsole("There are {0} contacts, reading them took {1} seconds", rows.Count, st.ElapsedMilliseconds / 1000);

                st.Reset();
                st.Start();
                // NOTE(review): e-mails are lower-cased only after Distinct(), so rows that differ
                // solely by e-mail casing may both survive, depending on how the row type defines
                // equality — confirm whether that is intended.
                rows = rows.AsParallel().Distinct().ToList();
                rows.ForEach(x => x.Email = x.Email.ToLower());
                st.Stop();

                WriteToConsole("Doing distinct took {0} seconds", st.ElapsedMilliseconds / 1000);
                WriteToConsole("We now have {0} contacts", rows.Count);

                if (rayCommandOptions.SaveDistinct)
                {
                    WriteCsv(rayCommandOptions, rows);
                }

                if (rayCommandOptions.ListTopDomains)
                {
                    TopDomains(rows, rayCommandOptions);
                }

                if (!string.IsNullOrEmpty(rayCommandOptions.EstimationParameters))
                {
                    CalculateSendingTime(rows, rayCommandOptions.EstimationParameters);
                }

                if (rayCommandOptions.MaximalCountOfContacts > 0)
                {
                    OutputSmallDomains(rows, rayCommandOptions);
                }

                if (rayCommandOptions.ExtractDomains)
                {
                    var domains = GroupByDomain(rows).Select(x => x.Key);
                    File.WriteAllLines(rayCommandOptions.OutputFile, domains);
                }

                if (rayCommandOptions.BadDomainsFile.HasValue())
                {
                    var domains = File.ReadAllLines(rayCommandOptions.BadDomainsFile).ToList();
                    var newRows = RemoveRowsByDomains(rows, domains);

                    WriteCsv(rayCommandOptions, newRows);
                }

                if (rayCommandOptions.RecordsFile.HasValue())
                {
                    var records = File.ReadAllLines(rayCommandOptions.RecordsFile).ToList();
                    var processedRecords = ConnectEmailsToRecords(records, rows);

                    File.WriteAllLines(rayCommandOptions.OutputFile, processedRecords);
                }

                if (rayCommandOptions.NamesFile.HasValue())
                {
                    KeepEmailsWithGoodNames(rayCommandOptions, rows);
                }
            }

            // The DNS check works on its own domain list and does not need a CSV file.
            if (rayCommandOptions.CheckDns.HasValue())
            {
                var domains = File.ReadAllLines(rayCommandOptions.CheckDns);
                CheckDns(domains, rayCommandOptions);
            }
        }
Example #6
0
        /// <summary>
        /// Builds a Lucene index over the local part (the text before '@') of every row's e-mail,
        /// runs a "*name*" wildcard search for each line of <c>rayCommandOptions.NamesFile</c> and
        /// marks the rows that are hit. Marked rows are written as CSV; the e-mails of unmarked
        /// rows go to "&lt;OutputFile&gt;.emails.without.names.txt".
        /// </summary>
        /// <param name="rayCommandOptions">Options supplying the names file and the output file.</param>
        /// <param name="rows">Contact rows; the <c>Mark</c> property of matching rows is set to true.</param>
        private static void KeepEmailsWithGoodNames(RayCommandOptions rayCommandOptions, List<OneRawContactsListCsvRow> rows)
        {
            var names = File.ReadAllLines(rayCommandOptions.NamesFile).ToList();

            // The index lives in a freshly named directory under the current working directory.
            var directory = Guid.NewGuid().ToString();

            // The Lucene directory, writer, reader and searcher all hold file handles or locks,
            // so each one is disposed as soon as it is no longer needed.
            using (var simpleFsDirectory = new SimpleFSDirectory(new DirectoryInfo(directory)))
            {
                if (!Directory.Exists(directory))
                {
                    var standardAnalyzer = new StandardAnalyzer(Version.LUCENE_30);

                    var st = new Stopwatch();
                    st.Start();

                    using (var indexer = new IndexWriter(simpleFsDirectory, standardAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
                    {
                        for (var rowIndex = 0; rowIndex < rows.Count; rowIndex++)
                        {
                            var document = new Document();
                            document.Add(new Field("address", rows[rowIndex].Email.Split('@')[0], Field.Store.YES, Field.Index.ANALYZED));
                            // The position in rows is stored so that a search hit can be mapped back to its row.
                            document.Add(new Field("collectionIndex", rowIndex.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                            indexer.AddDocument(document);
                        }

                        indexer.Commit();
                    }

                    st.Stop();

                    WriteToConsole("Index took: " + st.ElapsedMilliseconds / 1000);
                }

                using (var reader = IndexReader.Open(simpleFsDirectory, true))
                using (var searcher = new IndexSearcher(reader))
                {
                    var st2 = new Stopwatch();
                    st2.Start();

                    Parallel.ForEach(names, x =>
                        {
                            // Materialize once: the result is only needed for a single pass.
                            var ids = Search(searcher, "address", "*" + x + "*").ToList();

                            foreach (var p in ids)
                            {
                                rows[p].Mark = true;
                            }
                        });

                    var emailsWithNames = rows.AsParallel().Where(x => x.Mark).ToList();
                    var emailsWithoutNames = rows.AsParallel().Where(x => !x.Mark).Select(x => x.Email).ToList();

                    st2.Stop();
                    WriteToConsole("Processing all names took: " + st2.ElapsedMilliseconds / 1000);

                    File.WriteAllLines(rayCommandOptions.OutputFile + ".emails.without.names.txt", emailsWithoutNames);

                    WriteCsv(rayCommandOptions, emailsWithNames);
                }
            }
        }