/// <summary>
/// Gathers the substrings found at each position of the texts in the explored column.
/// Positions are queried in batches: a single query covers
/// <paramref name="substringQueryColumnCount"/> consecutive positions, and further batches
/// are requested until a batch comes back without any row that has a value.
/// </summary>
/// <param name="conn">Connection on which the substring queries are executed.</param>
/// <param name="ctx">Context providing the table and the column to explore.</param>
/// <param name="substringQueryColumnCount">Number of positions covered by one query.</param>
/// <param name="substringLengths">Substring lengths to explore; positions are scanned once per length.</param>
/// <returns>The substrings collected over all requested lengths and positions.</returns>
private static async Task<SubstringsData> ExploreSubstrings(
    DConnection conn,
    ExplorerContext ctx,
    int substringQueryColumnCount,
    params int[] substringLengths)
{
    var collected = new SubstringsData();

    foreach (var substringLength in substringLengths)
    {
        var batchStart = 0;
        bool batchHadValues;

        // The first batch is always queried; scanning stops after the first batch without values.
        do
        {
            var batchQuery = new TextColumnSubstring(
                ctx.Table, ctx.Column, batchStart, substringLength, substringQueryColumnCount);
            var batchResult = await conn.Exec(batchQuery);

            batchHadValues = false;
            foreach (var row in batchResult.Rows)
            {
                if (!row.HasValue)
                {
                    continue;
                }

                batchHadValues = true;

                // row.Index is relative to the batch, so shift it by the batch start position.
                collected.Add(batchStart + row.Index, row.Value, row.Count);
            }

            batchStart += substringQueryColumnCount;
        }
        while (batchHadValues);
    }

    return collected;
}
/// <summary>
/// Builds one synthetic email address out of randomly selected substrings.
/// A generated string is split on '@'; leading parts feed the local part and the remaining
/// parts feed the domain, unless a domain is drawn from <paramref name="domains"/> instead.
/// </summary>
/// <param name="substrings">Per-position substrings used to generate the raw string.</param>
/// <param name="domains">Extracted domains; used when their count reaches <paramref name="domainsCountThreshold"/>.</param>
/// <param name="tlds">Extracted top-level domains; a random one is appended to a generated domain.</param>
/// <param name="rand">Random source driving every choice made here.</param>
/// <param name="domainsCountThreshold">Minimum value of <c>domains.Count</c> for picking a domain from the list.</param>
/// <returns>
/// The generated address, or <see cref="string.Empty"/> when no usable local part or domain
/// could be derived from the generated string.
/// </returns>
private static string GenerateEmail(
    SubstringsData substrings,
    SubstringWithCountList domains,
    SubstringWithCountList tlds,
    Random rand,
    int domainsCountThreshold)
{
    // create local-part section
    var str = GenerateString(substrings, minLength: 6, rand);
    var allParts = str.Split('@', StringSplitOptions.RemoveEmptyEntries);
    var sb = new StringBuilder();
    var partIndex = 0;

    // Probability of consuming the next part: 1, 0.5, 0.25, ...
    // This has to be a double. As an int, halving 1 truncates to 0, so every part after the
    // first one would practically never be taken.
    var pnext = 1.0;
    while (partIndex < allParts.Length && rand.NextDouble() <= pnext)
    {
        sb.Append(allParts[partIndex]);
        pnext /= 2;
        partIndex++;
    }

    // Keep only single-character pieces or pieces longer than 3 characters, and use at most
    // 1 or 2 of them (the upper bound of Random.Next is exclusive).
    var localParts = sb.ToString()
        .Split('.', StringSplitOptions.RemoveEmptyEntries)
        .Where(s => s.Length == 1 || s.Length > 3)
        .Take(rand.Next(1, 3));
    var localPart = string.Join('.', localParts);
    if (string.IsNullOrEmpty(localPart))
    {
        return string.Empty;
    }

    if (domains.Count >= domainsCountThreshold)
    {
        // Enough domains were extracted, so pick one from the list.
        // NOTE(review): no '@' is inserted here - presumably the extracted values already
        // start with '@'; confirm against the code that fills the list.
        return localPart + domains.GetRandomValue(rand, @default: string.Empty);
    }

    // create domain section from the parts not consumed by the local part
    sb.Clear();
    while (partIndex < allParts.Length)
    {
        sb.Append(allParts[partIndex]);
        partIndex++;
    }

    var domainParts = sb.ToString()
        .Split('.', StringSplitOptions.RemoveEmptyEntries)
        .Where(p => p.Length > 3);

    // Most of the time (NextDouble above 0.15) use only the longest piece, the last one
    // winning ties; otherwise join all pieces with dots.
    var domain = rand.NextDouble() > 0.15
        ? domainParts.Aggregate(string.Empty, (max, cur) => max.Length > cur.Length ? max : cur)
        : string.Join('.', domainParts);
    if (string.IsNullOrEmpty(domain) || domain.Length < 4)
    {
        return string.Empty;
    }

    // NOTE(review): the TLD is appended without a separator - presumably the extracted
    // values include the leading dot; confirm.
    return localPart + "@" + domain + tlds.GetRandomValue(rand, @default: string.Empty);
}
/// <summary>
/// Generates a random string by concatenating randomly chosen substrings, walking the
/// available positions from the start.
/// </summary>
/// <param name="substrings">Per-position substrings to draw from.</param>
/// <param name="minLength">Lower bound for the randomly chosen target length.</param>
/// <param name="rand">Random source used for the target length and the substring choice.</param>
/// <returns>
/// The concatenated string. It can be shorter than the target length when the positions run
/// out first, and longer when the last appended substring overshoots the target.
/// </returns>
private static string GenerateString(SubstringsData substrings, int minLength, Random rand)
{
    var sb = new StringBuilder();

    // Random.Next(min, max) throws ArgumentOutOfRangeException when min is greater than max,
    // which happens when fewer positions than minLength are available. Fall back to minLength
    // in that case; the loop below is bounded by substrings.Count anyway.
    var len = substrings.Count >= minLength
        ? rand.Next(minLength, substrings.Count)
        : minLength;

    for (var pos = 0; pos < substrings.Count && sb.Length < len; pos++)
    {
        var str = substrings.GetRandomSubstring(pos, rand);
        sb.Append(str);

        // NOTE(review): together with the loop increment this advances by str.Length + 1,
        // i.e. one position is skipped after every substring - confirm that this is intended.
        pos += str.Length;
    }

    return sb.ToString();
}
/// <summary>
/// Produces a lazily evaluated sequence of synthetic email addresses.
/// Up to 100 generation attempts are made per requested sample; attempts that yield an empty
/// result are dropped, and the sequence ends once <c>config.SamplesToPublish</c> addresses
/// have been produced or the attempts are used up.
/// </summary>
/// <param name="substrings">Per-position substrings forwarded to the email generator.</param>
/// <param name="domains">Extracted domains forwarded to the email generator.</param>
/// <param name="tlds">Extracted top-level domains forwarded to the email generator.</param>
/// <param name="lengthDistribution">Length distribution forwarded to the email generator.</param>
/// <param name="config">Configuration supplying <c>SamplesToPublish</c>; also forwarded to the email generator.</param>
/// <returns>A deferred sequence of non-empty generated addresses.</returns>
private static IEnumerable<string> GenerateEmails(
    SubstringsData substrings,
    SubstringWithCountList domains,
    SubstringWithCountList tlds,
    LengthDistribution lengthDistribution,
    SampleValuesGeneratorConfig.Result config)
{
    // The random source is created up front and shared by every attempt of the sequence.
    var rand = new Random(Environment.TickCount);
    var maxAttempts = 100 * config.SamplesToPublish;

    var nonEmptyEmails =
        from attempt in Enumerable.Range(0, maxAttempts)
        select GenerateEmail(substrings, domains, tlds, lengthDistribution, config, rand) into email
        where !string.IsNullOrEmpty(email)
        select email;

    return nonEmptyEmails.Take(config.SamplesToPublish);
}
/// <summary>
/// Assembles a random string from randomly chosen substrings, starting at position 0 and
/// moving forward until the target length is reached or the positions run out.
/// </summary>
/// <param name="substrings">Per-position substrings to draw from.</param>
/// <param name="lengthDistribution">Source of the random target length.</param>
/// <param name="minLength">Lower bound applied to the drawn target length.</param>
/// <param name="rand">Random source used for the target length and the substring choice.</param>
/// <returns>
/// The assembled string, or <see cref="string.Empty"/> when its upper-cased form is contained
/// in <c>BannedWords</c>.
/// </returns>
private static string GenerateString(
    SubstringsData substrings,
    LengthDistribution lengthDistribution,
    int minLength,
    Random rand)
{
    var builder = new StringBuilder();
    var targetLength = Math.Max(minLength, lengthDistribution.GetRandomValue(rand));

    var position = 0;
    while (position < substrings.Count && builder.Length < targetLength)
    {
        // A missing substring is represented by a "*" placeholder.
        var piece = substrings.GetRandomSubstring(position, rand) ?? "*";
        builder.Append(piece);

        // Move past the appended piece plus one further position.
        position += piece.Length + 1;
    }

    var generated = builder.ToString();
    if (BannedWords.Contains(generated.ToUpperInvariant()))
    {
        return string.Empty;
    }

    return generated;
}
/// <summary>
/// Generates a list of synthetic email addresses.
/// Generation stops once <paramref name="generatedValuesCount"/> non-empty addresses have been
/// collected or after 100 attempts per requested address, whichever happens first.
/// </summary>
/// <param name="substrings">Per-position substrings forwarded to the email generator.</param>
/// <param name="domains">Extracted domains forwarded to the email generator.</param>
/// <param name="tlds">Extracted top-level domains forwarded to the email generator.</param>
/// <param name="generatedValuesCount">Number of addresses requested.</param>
/// <param name="domainsCountThreshold">Threshold forwarded to the email generator.</param>
/// <returns>The collected addresses; possibly fewer than requested.</returns>
private static IEnumerable<string> GenerateEmails(
    SubstringsData substrings,
    SubstringWithCountList domains,
    SubstringWithCountList tlds,
    int generatedValuesCount,
    int domainsCountThreshold)
{
    var rand = new Random(Environment.TickCount);
    var collected = new List<string>(generatedValuesCount);

    var attempt = 0;
    while (collected.Count < generatedValuesCount && attempt < generatedValuesCount * 100)
    {
        attempt++;

        var candidate = GenerateEmail(substrings, domains, tlds, rand, domainsCountThreshold);
        if (string.IsNullOrEmpty(candidate))
        {
            // Failed attempts are discarded but still count towards the attempt limit.
            continue;
        }

        collected.Add(candidate);
    }

    return collected;
}
/// <summary>
/// Builds one synthetic email address out of randomly selected substrings.
/// A generated string is split on '@'; leading parts feed the local part and the remaining
/// parts feed the domain, unless a domain is drawn from <paramref name="domains"/> instead.
/// Pieces whose upper-cased form is contained in <c>BannedWords</c> are left out.
/// </summary>
/// <param name="substrings">Per-position substrings used to generate the raw string.</param>
/// <param name="domains">Extracted domains; used when <c>domains.TotalCount</c> exceeds <c>config.MinValuesForCategoricalSampling</c>.</param>
/// <param name="tlds">Extracted top-level domains; a random one is appended to a generated domain.</param>
/// <param name="lengthDistribution">Length distribution forwarded to the string generator.</param>
/// <param name="config">Configuration supplying <c>MinValuesForCategoricalSampling</c>.</param>
/// <param name="rand">Random source driving every choice made here.</param>
/// <returns>
/// The generated address, or <see cref="string.Empty"/> when the generated string is empty or
/// no usable local part or domain could be derived from it.
/// </returns>
private static string GenerateEmail(
    SubstringsData substrings,
    SubstringWithCountList domains,
    SubstringWithCountList tlds,
    LengthDistribution lengthDistribution,
    SampleValuesGeneratorConfig.Result config,
    Random rand)
{
    // create local-part section
    var str = GenerateString(substrings, lengthDistribution, minLength: 6, rand);
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    var allParts = str.Split('@', StringSplitOptions.RemoveEmptyEntries);
    var sb = new StringBuilder();
    var partIndex = 0;

    // Probability of consuming the next part: 1, 0.5, 0.25, ...
    // This has to be a double. As an int, halving 1 truncates to 0, so every part after the
    // first one would practically never be taken.
    var pnext = 1.0;
    while (partIndex < allParts.Length && rand.NextDouble() <= pnext)
    {
        sb.Append(allParts[partIndex]);
        pnext /= 2;
        partIndex++;
    }

    // Keep only single-character pieces or pieces longer than 3 characters that are not banned.
    var localParts = sb.ToString()
        .Split('.', StringSplitOptions.RemoveEmptyEntries)
        .Where(s => (s.Length == 1 || s.Length > 3) && !BannedWords.Contains(s.ToUpperInvariant()));
    var localPart = string.Join('.', localParts);
    if (string.IsNullOrEmpty(localPart))
    {
        return string.Empty;
    }

    if (domains.TotalCount > config.MinValuesForCategoricalSampling)
    {
        // Enough domain values were extracted, so pick one from the list.
        // NOTE(review): no '@' is inserted here - presumably the extracted values already
        // start with '@'; confirm against the code that fills the list.
        return localPart + domains.GetRandomValue(rand);
    }

    // create domain section from the parts not consumed by the local part
    sb.Clear();
    while (partIndex < allParts.Length)
    {
        sb.Append(allParts[partIndex]);
        partIndex++;
    }

    var domainParts = sb.ToString()
        .Split('.', StringSplitOptions.RemoveEmptyEntries)
        .Where(p => p.Length > 3 && !BannedWords.Contains(p.ToUpperInvariant()));

    // Most of the time (NextDouble above 0.15) use only the longest piece, the last one
    // winning ties; otherwise join all pieces with dots.
    var domain = rand.NextDouble() > 0.15
        ? domainParts.Aggregate(string.Empty, (max, cur) => max.Length > cur.Length ? max : cur)
        : string.Join('.', domainParts);
    if (string.IsNullOrEmpty(domain) || domain.Length < 4)
    {
        return string.Empty;
    }

    // NOTE(review): the TLD is appended without a separator - presumably the extracted
    // values include the leading dot; confirm.
    return localPart + "@" + domain + tlds.GetRandomValue(rand);
}