/// <summary>
/// Looks up every distinct entry of <paramref name="domains"/> through the DNS server at 8.8.8.8 and
/// splits the input into "bad" and "clean" domains.
/// </summary>
/// <remarks>
/// A domain is kept (clean) when its MX lookup answers with <c>NoError</c> or carries MX records, or when
/// its A lookup answers with <c>NoError</c>. It is reported as bad when it fails <c>HasValue()</c>, when the
/// A lookup returns <c>null</c> or any code other than <c>NoError</c>/<c>ServerFailure</c>, or when the A lookup
/// answers <c>ServerFailure</c> on every attempt.
/// Four files are written, all derived from <c>rayCommandOptions.OutputFile</c>: the sorted bad domains
/// (the output file itself), <c>.{version}.clean.txt</c>, <c>.error.log.{version}.txt</c> and <c>.mx.txt</c>,
/// where <c>version</c> is the first six characters of a fresh GUID.
/// </remarks>
/// <param name="domains">Domain names to check; duplicates are looked up once.</param>
/// <param name="rayCommandOptions">Options whose <c>OutputFile</c> is the base path of the result files.</param>
private static void CheckDns(IEnumerable<string> domains, RayCommandOptions rayCommandOptions)
{
    const int maxARecordAttempts = 2;

    var st = Stopwatch.StartNew();
    var version = Guid.NewGuid().ToString().Substring(0, 6);

    // Materialize once: the sequence is walked for the lookups and again for the clean list.
    var allDomains = domains.ToList();

    var error = new List<string>();
    var mx = new List<string>();

    // The filter below runs on several PLINQ worker threads and List<T>.Add is not thread-safe,
    // so every append to the shared lists goes through this lock.
    var gate = new object();
    Action<List<string>, string> record = (target, line) =>
    {
        lock (gate)
        {
            target.Add(line);
        }
    };

    var badDomain = allDomains
        .Distinct()
        .AsParallel()
        .Where(domain =>
        {
            if (!domain.HasValue())
            {
                return true;
            }

            // NOTE(review): 10000 is presumably the query timeout in milliseconds - confirm against DnsClient.
            var client = new DnsClient(IPAddress.Parse("8.8.8.8"), 10000);

            var mxRecords = client.Resolve(domain, RecordType.Mx);
            if (mxRecords != null)
            {
                var exchanges = mxRecords.AnswerRecords.OfType<MxRecord>().ToList();
                if (mxRecords.ReturnCode == ReturnCode.NoError || exchanges.Any())
                {
                    if (exchanges.Any())
                    {
                        record(mx, "The domain: " + domain + " has mx records: " + exchanges.Select(x => x.ExchangeDomainName).Commafy());
                    }

                    return false;
                }
            }

            // No usable MX answer: fall back to the A record, retrying only on ServerFailure.
            for (var attempt = 0; attempt < maxARecordAttempts; attempt++)
            {
                var aRecord = client.Resolve(domain, RecordType.A);
                if (aRecord == null)
                {
                    record(error, "this domain produce null: " + domain);
                    return true;
                }

                if (aRecord.ReturnCode == ReturnCode.ServerFailure)
                {
                    continue;
                }

                if (aRecord.ReturnCode != ReturnCode.NoError)
                {
                    record(error, aRecord.ReturnCode + " dns error for: " + domain);
                    return true;
                }

                // NoError: the name resolves, so the domain is kept. No connection probe is attempted.
                return false;
            }

            // Every attempt ended in ServerFailure.
            return true;
        })
        .ToList();

    st.Stop();
    WriteToConsole("Total shabank took: " + st.ElapsedMilliseconds);

    File.WriteAllLines(rayCommandOptions.OutputFile, badDomain.OrderBy(x => x));
    File.WriteAllLines(rayCommandOptions.OutputFile + "." + version + ".clean.txt", allDomains.Except(badDomain).OrderBy(x => x));
    File.WriteAllLines(rayCommandOptions.OutputFile + ".error.log." + version + ".txt", error.OrderBy(x => x).ToList());
    File.WriteAllLines(rayCommandOptions.OutputFile + ".mx.txt", mx);
}
/// <summary>
/// Writes <paramref name="newRows"/> as CSV records to the file named by <c>rayCommandOptions.OutputFile</c>.
/// </summary>
/// <param name="rayCommandOptions">Options supplying the output file path.</param>
/// <param name="newRows">Rows to serialize.</param>
private static void WriteCsv(RayCommandOptions rayCommandOptions, IEnumerable<OneRawContactsListCsvRow> newRows)
{
    var destination = rayCommandOptions.OutputFile;

    // The StreamWriter owns the file; leaving the using block flushes and closes it.
    using (var output = new StreamWriter(destination))
    {
        new CsvWriter(output).WriteRecords(newRows);
    }
}
/// <summary>
/// Groups the rows by domain (case-insensitively), orders the groups by descending size and writes one
/// "Domain: {name} has: {count}" line per group to <c>rayCommandOptions.OutputFile</c>.
/// </summary>
/// <param name="rows">Contact rows; each is mapped to a domain through <c>GetDomain</c>.</param>
/// <param name="rayCommandOptions">Options supplying the output file path.</param>
private static void TopDomains(IEnumerable<OneRawContactsListCsvRow> rows, RayCommandOptions rayCommandOptions)
{
    var domains = rows
        .Select(GetDomain)
        // Domain names are machine identifiers: fold case with the invariant culture so the grouping
        // does not depend on the current locale (e.g. the Turkish dotless i).
        .GroupBy(x => x.ToLowerInvariant())
        .Select(x => new { x.Key, Count = x.Count() })
        .OrderByDescending(x => x.Count)
        .ToList();

    WriteToConsole("There are {0} groups", domains.Count);

    // NOTE(review): the message says "top 10", but every group is written to the file below.
    WriteToConsole("The top 10 domains are:");
    WriteSaperator();

    var topDomains = domains.Select(x => string.Format("Domain: {0} has: {1}", x.Key, x.Count));
    File.WriteAllLines(rayCommandOptions.OutputFile, topDomains);

    WriteSaperator();
}
/// <summary>
/// Drops every row whose domain has more than <c>rayCommandOptions.MaximalCountOfContacts</c> contacts and
/// writes the remaining rows as CSV, reporting the duration of each phase on the console.
/// </summary>
/// <param name="rows">All contact rows.</param>
/// <param name="rayCommandOptions">Options supplying the contact-count limit and the output file.</param>
private static void OutputSmallDomains(List<OneRawContactsListCsvRow> rows, RayCommandOptions rayCommandOptions)
{
    // Phase 1: find the domains that exceed the limit (groups with an empty key are ignored).
    var timer = Stopwatch.StartNew();
    var oversizedGroups = GroupByDomain(rows).Where(x => x.Count() > rayCommandOptions.MaximalCountOfContacts);
    var removeDomains = oversizedGroups
        .Select(x => x.Key)
        .Where(x => x.HasValue())
        .ToList();
    timer.Stop();

    var groupingSeconds = timer.ElapsedMilliseconds / 1000L;
    WriteToConsole("Group by took {0} seconds", groupingSeconds);
    WriteToConsole("There are {0} domains to remove, they are {1}", removeDomains.Count, removeDomains.Commafy());

    // Phase 2: filter the rows.
    timer.Restart();
    var newRows = RemoveRowsByDomains(rows, removeDomains);
    timer.Stop();
    WriteToConsole("Removing domains took {0} ms", timer.ElapsedMilliseconds);

    // Phase 3: persist the result.
    timer.Restart();
    WriteCsv(rayCommandOptions, newRows);
    timer.Stop();
    WriteToConsole("Writing the CSV took {0} ms", timer.ElapsedMilliseconds);
}
/// <summary>
/// Program entry point: parses the command line into a <c>RayCommandOptions</c> and runs every operation
/// whose option is set.
/// </summary>
/// <remarks>
/// When <c>CsvFile</c> is set the contacts are loaded, de-duplicated and lower-cased, then the CSV-based
/// operations run in a fixed order. Several of them write to the same <c>OutputFile</c>, so a later
/// operation replaces the output of an earlier one. The DNS check runs independently of <c>CsvFile</c>.
/// </remarks>
/// <param name="args">Raw command-line arguments.</param>
static void Main(string[] args)
{
    var rayCommandOptions = new RayCommandOptions();
    if (!CommandLineParser.Default.ParseArguments(args, rayCommandOptions))
    {
        WriteToConsole("Parameter problem");
        return;
    }

    if (!string.IsNullOrEmpty(rayCommandOptions.CsvFile))
    {
        var st = Stopwatch.StartNew();

        // Read everything into memory and release the input file before any output is written,
        // so the handle is not held (or leaked) for the rest of the run.
        List<OneRawContactsListCsvRow> rows;
        using (var csvSource = File.OpenRead(rayCommandOptions.CsvFile))
        using (var csvText = new StreamReader(csvSource))
        {
            var csvReader = new CsvReader(csvText);
            rows = csvReader.GetRecords<OneRawContactsListCsvRow>().ToList();
        }

        st.Stop();
        WriteToConsole("There are {0} contacts, reading them took {1} seconds", rows.Count, st.ElapsedMilliseconds / 1000);

        st.Restart();
        // NOTE(review): duplicates are removed before the addresses are lower-cased; whether rows that
        // differ only by case collapse depends on the row type's equality - confirm.
        rows = rows.AsParallel().Distinct().ToList();
        // Invariant culture: e-mail addresses are identifiers, not user-facing text.
        rows.ForEach(x => x.Email = x.Email.ToLowerInvariant());
        st.Stop();
        WriteToConsole("Doing distinct took {0} seconds", st.ElapsedMilliseconds / 1000);
        WriteToConsole("We now have {0} contacts", rows.Count);

        if (rayCommandOptions.SaveDistinct)
        {
            WriteCsv(rayCommandOptions, rows);
        }

        if (rayCommandOptions.ListTopDomains)
        {
            TopDomains(rows, rayCommandOptions);
        }

        if (!string.IsNullOrEmpty(rayCommandOptions.EstimationParameters))
        {
            CalculateSendingTime(rows, rayCommandOptions.EstimationParameters);
        }

        if (rayCommandOptions.MaximalCountOfContacts > 0)
        {
            OutputSmallDomains(rows, rayCommandOptions);
        }

        if (rayCommandOptions.ExtractDomains)
        {
            var domains = GroupByDomain(rows).Select(x => x.Key);
            File.WriteAllLines(rayCommandOptions.OutputFile, domains);
        }

        if (rayCommandOptions.BadDomainsFile.HasValue())
        {
            var domains = File.ReadAllLines(rayCommandOptions.BadDomainsFile).ToList();
            var newRows = RemoveRowsByDomains(rows, domains);
            WriteCsv(rayCommandOptions, newRows);
        }

        if (rayCommandOptions.RecordsFile.HasValue())
        {
            var records = File.ReadAllLines(rayCommandOptions.RecordsFile).ToList();
            var processedRecords = ConnectEmailsToRecords(records, rows);
            File.WriteAllLines(rayCommandOptions.OutputFile, processedRecords);
        }

        if (rayCommandOptions.NamesFile.HasValue())
        {
            KeepEmailsWithGoodNames(rayCommandOptions, rows);
        }
    }

    if (rayCommandOptions.CheckDns.HasValue())
    {
        var domains = File.ReadAllLines(rayCommandOptions.CheckDns);
        CheckDns(domains, rayCommandOptions);
    }
}
/// <summary>
/// Marks every row whose e-mail local part (the text before '@') matches one of the names read from
/// <c>rayCommandOptions.NamesFile</c>, then writes the marked rows as CSV to the output file and the
/// addresses of the unmarked rows to <c>OutputFile + ".emails.without.names.txt"</c>.
/// </summary>
/// <remarks>
/// Matching is done through a Lucene index built in a directory named after a fresh GUID. Each name is
/// searched as the wildcard query <c>*name*</c> on the "address" field, and the hits are used as indexes
/// into <paramref name="rows"/>.
/// NOTE(review): the index directory is not deleted after the run.
/// </remarks>
/// <param name="rayCommandOptions">Options supplying the names file and the output file.</param>
/// <param name="rows">Contact rows; their <c>Mark</c> flag is set in place.</param>
private static void KeepEmailsWithGoodNames(RayCommandOptions rayCommandOptions, List<OneRawContactsListCsvRow> rows)
{
    var names = File.ReadAllLines(rayCommandOptions.NamesFile).ToList();
    var directory = Guid.NewGuid().ToString();

    // Index directory, writer, reader and searcher all hold files or locks; dispose each one
    // deterministically instead of leaving them open until the process exits.
    using (var simpleFsDirectory = new SimpleFSDirectory(new DirectoryInfo(directory)))
    {
        if (!Directory.Exists(directory))
        {
            var standardAnalyzer = new StandardAnalyzer(Version.LUCENE_30);
            using (var indexer = new IndexWriter(simpleFsDirectory, standardAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                var st = Stopwatch.StartNew();

                // "collectionIndex" stores the row's position so a search hit can be mapped back to rows[i].
                for (var counter = 0; counter < rows.Count; counter++)
                {
                    var document = new Document();
                    document.Add(new Field("address", rows[counter].Email.Split('@')[0], Field.Store.YES, Field.Index.ANALYZED));
                    document.Add(new Field("collectionIndex", counter.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    indexer.AddDocument(document);
                }

                indexer.Commit();
                st.Stop();
                WriteToConsole("Index took: " + st.ElapsedMilliseconds / 1000);
            }
        }

        using (var reader = IndexReader.Open(simpleFsDirectory, true))
        using (var searcher = new IndexSearcher(reader))
        {
            var st2 = Stopwatch.StartNew();

            Parallel.ForEach(names, x =>
            {
                // Materialize once so the search result is not enumerated twice.
                var ids = Search(searcher, "address", "*" + x + "*").ToList();

                // Workers only ever set Mark to true, so overlapping hits write the same value.
                // NOTE(review): assumes Mark is a plain flag without side effects - confirm.
                foreach (var p in ids)
                {
                    rows[p].Mark = true;
                }
            });

            var emailsWithNames = rows.AsParallel().Where(x => x.Mark).ToList();
            var emailsWithoutNames = rows.AsParallel().Where(x => !x.Mark).Select(x => x.Email).ToList();

            st2.Stop();
            WriteToConsole("Processing all names took: " + st2.ElapsedMilliseconds / 1000);

            File.WriteAllLines(rayCommandOptions.OutputFile + ".emails.without.names.txt", emailsWithoutNames);
            WriteCsv(rayCommandOptions, emailsWithNames);
        }
    }
}