コード例 #1
0
        /// <summary>
        /// Collects substrings, together with their counts, for the positions of the texts in the
        /// column described by <paramref name="ctx"/>. Positions are queried in batches: each query
        /// is built for <paramref name="substringQueryColumnCount"/> positions starting at the current offset.
        /// </summary>
        /// <param name="conn">Connection used to execute the substring queries.</param>
        /// <param name="ctx">Context supplying the table and column to explore.</param>
        /// <param name="substringQueryColumnCount">Step by which the start position advances between queries.</param>
        /// <param name="substringLengths">Substring lengths to explore; each length is scanned from position 0.</param>
        /// <returns>The substrings gathered across all lengths and positions.</returns>
        private static async Task<SubstringsData> ExploreSubstrings(
            DConnection conn,
            ExplorerContext ctx,
            int substringQueryColumnCount,
            params int[] substringLengths)
        {
            var collected = new SubstringsData();

            foreach (var substringLength in substringLengths)
            {
                var offset = 0;
                bool batchHadValues;

                // Keep querying consecutive batches until one of them yields no row with a value.
                do
                {
                    var batchQuery = new TextColumnSubstring(
                        ctx.Table, ctx.Column, offset, substringLength, substringQueryColumnCount);
                    var batch = await conn.Exec(batchQuery);

                    batchHadValues = false;
                    foreach (var row in batch.Rows)
                    {
                        if (!row.HasValue)
                        {
                            continue;
                        }

                        batchHadValues = true;
                        // row.Index is relative to the batch; add the batch offset when recording it.
                        collected.Add(offset + row.Index, row.Value, row.Count);
                    }

                    offset += substringQueryColumnCount;
                }
                while (batchHadValues);
            }

            return collected;
        }
コード例 #2
0
        /// <summary>
        /// Generates a single synthetic email address from randomly assembled substrings.
        /// </summary>
        /// <param name="substrings">Substring data from which the raw text is generated.</param>
        /// <param name="domains">Extracted domains; one is picked at random when the list holds at least
        /// <paramref name="domainsCountThreshold"/> entries.</param>
        /// <param name="tlds">Extracted top-level domains; a random one is appended to a generated domain.</param>
        /// <param name="rand">Random source used for all random decisions.</param>
        /// <param name="domainsCountThreshold">Minimum <c>domains.Count</c> for sampling from <paramref name="domains"/>.</param>
        /// <returns>
        /// The generated address, or <see cref="string.Empty"/> when no usable local part or domain could be built.
        /// </returns>
        private static string GenerateEmail(
            SubstringsData substrings,
            SubstringWithCountList domains,
            SubstringWithCountList tlds,
            Random rand,
            int domainsCountThreshold)
        {
            // create local-part section
            var str       = GenerateString(substrings, minLength: 6, rand);
            var allParts  = str.Split('@', StringSplitOptions.RemoveEmptyEntries);
            var sb        = new StringBuilder();
            var partIndex = 0;

            // Probability of taking the next '@'-separated part into the local part: 1, 0.5, 0.25, ...
            // This has to be a floating-point value; as an int, `pnext /= 2` truncated 1 to 0 and the
            // halving never took effect.
            var pnext = 1.0;

            while (partIndex < allParts.Length && rand.NextDouble() <= pnext)
            {
                sb.Append(allParts[partIndex]);
                pnext /= 2;
                partIndex++;
            }

            // Keep only dot-separated pieces that are a single character or longer than three
            // characters, and use at most one or two of them (rand.Next(1, 3) yields 1 or 2).
            var localParts = sb.ToString()
                             .Split('.', StringSplitOptions.RemoveEmptyEntries)
                             .Where(s => s.Length == 1 || s.Length > 3)
                             .Take(rand.Next(1, 3));
            var localPart = string.Join('.', localParts);

            if (string.IsNullOrEmpty(localPart))
            {
                return(string.Empty);
            }
            if (domains.Count >= domainsCountThreshold)
            {
                // if the number of distinct domains is big enough we select one from the extracted list
                // NOTE(review): no '@' is inserted here - presumably the extracted domain values
                // already start with '@'; confirm against the code that fills `domains`.
                return(localPart + domains.GetRandomValue(rand, @default: string.Empty));
            }

            // create domain section from the parts not consumed by the local part
            sb.Clear();
            while (partIndex < allParts.Length)
            {
                sb.Append(allParts[partIndex]);
                partIndex++;
            }
            var domainParts = sb.ToString()
                              .Split('.', StringSplitOptions.RemoveEmptyEntries)
                              .Where(p => p.Length > 3);

            // Most of the time (NextDouble() > 0.15) use only the longest part (the later part wins
            // ties); otherwise join all remaining parts with dots.
            var domain = rand.NextDouble() > 0.15 ?
                         domainParts.Aggregate(string.Empty, (max, cur) => max.Length > cur.Length ? max : cur) :
                         string.Join('.', domainParts);

            if (string.IsNullOrEmpty(domain) || domain.Length < 4)
            {
                return(string.Empty);
            }
            return(localPart + "@" + domain + tlds.GetRandomValue(rand, @default: string.Empty));
        }
コード例 #3
0
        /// <summary>
        /// Builds a random string by concatenating randomly chosen substrings taken at increasing positions.
        /// </summary>
        /// <param name="substrings">Source of the per-position substrings.</param>
        /// <param name="minLength">Lower bound for the randomly chosen target length.</param>
        /// <param name="rand">Random source used for the target length and the substring selection.</param>
        /// <returns>
        /// The concatenated string. Building stops once the target length is reached or the positions
        /// run out, so the result length is not guaranteed to equal the target.
        /// </returns>
        private static string GenerateString(SubstringsData substrings, int minLength, Random rand)
        {
            var sb = new StringBuilder();

            // Random.Next(min, max) throws ArgumentOutOfRangeException when min > max, which happened
            // whenever fewer than minLength positions were available. Only draw a random target length
            // when the range is valid; otherwise fall back to minLength.
            var len = substrings.Count >= minLength
                ? rand.Next(minLength, substrings.Count)
                : minLength;

            for (var pos = 0; pos < substrings.Count && sb.Length < len; pos++)
            {
                var str = substrings.GetRandomSubstring(pos, rand);
                sb.Append(str);
                // Advance past the appended substring; the loop's own pos++ then moves one further.
                pos += str.Length;
            }
            return(sb.ToString());
        }
コード例 #4
0
        /// <summary>
        /// Produces up to <c>config.SamplesToPublish</c> non-empty synthetic email addresses, making at
        /// most 100 generation attempts per requested sample.
        /// </summary>
        /// <param name="substrings">Substring data passed on to the single-email generator.</param>
        /// <param name="domains">Extracted domains passed on to the single-email generator.</param>
        /// <param name="tlds">Extracted top-level domains passed on to the single-email generator.</param>
        /// <param name="lengthDistribution">Length distribution passed on to the single-email generator.</param>
        /// <param name="config">Configuration supplying the number of samples to publish.</param>
        /// <returns>
        /// A deferred sequence: emails are generated while it is enumerated, and the random generator is
        /// shared across enumerations, so enumerating it again yields different values.
        /// </returns>
        private static IEnumerable<string> GenerateEmails(
            SubstringsData substrings,
            SubstringWithCountList domains,
            SubstringWithCountList tlds,
            LengthDistribution lengthDistribution,
            SampleValuesGeneratorConfig.Result config)
        {
            var random = new Random(Environment.TickCount);

            // Every attempt yields either an email or an empty string.
            var attempts = Enumerable
                .Range(0, 100 * config.SamplesToPublish)
                .Select(_ => GenerateEmail(substrings, domains, tlds, lengthDistribution, config, random));

            // Drop the failed attempts, then cap the result at the requested sample count.
            var successful = attempts.Where(candidate => !string.IsNullOrEmpty(candidate));

            return successful.Take(config.SamplesToPublish);
        }
コード例 #5
0
        /// <summary>
        /// Assembles a random string from substrings picked at increasing positions and rejects the
        /// result when it is a banned word.
        /// </summary>
        /// <param name="substrings">Source of the per-position substrings.</param>
        /// <param name="lengthDistribution">Distribution from which the target length is drawn.</param>
        /// <param name="minLength">Lower bound applied to the drawn target length.</param>
        /// <param name="rand">Random source used for the length and the substring selection.</param>
        /// <returns>
        /// The assembled string, or <see cref="string.Empty"/> when its upper-cased form is contained
        /// in <c>BannedWords</c>.
        /// </returns>
        private static string GenerateString(
            SubstringsData substrings,
            LengthDistribution lengthDistribution,
            int minLength,
            Random rand)
        {
            var builder      = new StringBuilder();
            var targetLength = Math.Max(minLength, lengthDistribution.GetRandomValue(rand));
            var position     = 0;

            while (position < substrings.Count && builder.Length < targetLength)
            {
                // "*" stands in when no substring is available for this position.
                var piece = substrings.GetRandomSubstring(position, rand) ?? "*";
                builder.Append(piece);

                // Move past the appended piece plus one extra position.
                position += piece.Length + 1;
            }

            var candidate = builder.ToString();
            if (BannedWords.Contains(candidate.ToUpperInvariant()))
            {
                return string.Empty;
            }

            return candidate;
        }
コード例 #6
0
        /// <summary>
        /// Generates up to <paramref name="generatedValuesCount"/> non-empty synthetic email addresses,
        /// giving up after 100 attempts per requested value.
        /// </summary>
        /// <param name="substrings">Substring data passed on to the single-email generator.</param>
        /// <param name="domains">Extracted domains passed on to the single-email generator.</param>
        /// <param name="tlds">Extracted top-level domains passed on to the single-email generator.</param>
        /// <param name="generatedValuesCount">Number of emails requested.</param>
        /// <param name="domainsCountThreshold">Threshold passed on to the single-email generator.</param>
        /// <returns>A list with the generated emails; it may hold fewer entries than requested.</returns>
        private static IEnumerable<string> GenerateEmails(
            SubstringsData substrings,
            SubstringWithCountList domains,
            SubstringWithCountList tlds,
            int generatedValuesCount,
            int domainsCountThreshold)
        {
            var random   = new Random(Environment.TickCount);
            var results  = new List<string>(generatedValuesCount);
            var attempts = 0;

            // Empty results count as failed attempts and are retried until the attempt budget is spent.
            while (results.Count < generatedValuesCount && attempts < generatedValuesCount * 100)
            {
                attempts++;

                var candidate = GenerateEmail(substrings, domains, tlds, random, domainsCountThreshold);
                if (string.IsNullOrEmpty(candidate))
                {
                    continue;
                }

                results.Add(candidate);
            }

            return results;
        }
コード例 #7
0
        /// <summary>
        /// Generates a single synthetic email address from randomly assembled substrings, skipping
        /// parts that are banned words.
        /// </summary>
        /// <param name="substrings">Substring data from which the raw text is generated.</param>
        /// <param name="domains">Extracted domains; one is picked at random when <c>domains.TotalCount</c>
        /// exceeds <c>config.MinValuesForCategoricalSampling</c>.</param>
        /// <param name="tlds">Extracted top-level domains; a random one is appended to a generated domain.</param>
        /// <param name="lengthDistribution">Length distribution used when generating the raw text.</param>
        /// <param name="config">Configuration supplying the categorical sampling threshold.</param>
        /// <param name="rand">Random source used for all random decisions.</param>
        /// <returns>
        /// The generated address, or <see cref="string.Empty"/> when no usable text, local part or
        /// domain could be built.
        /// </returns>
        private static string GenerateEmail(
            SubstringsData substrings,
            SubstringWithCountList domains,
            SubstringWithCountList tlds,
            LengthDistribution lengthDistribution,
            SampleValuesGeneratorConfig.Result config,
            Random rand)
        {
            // create local-part section
            var str = GenerateString(substrings, lengthDistribution, minLength: 6, rand);

            if (string.IsNullOrEmpty(str))
            {
                return(string.Empty);
            }
            var allParts  = str.Split('@', StringSplitOptions.RemoveEmptyEntries);
            var sb        = new StringBuilder();
            var partIndex = 0;

            // Probability of taking the next '@'-separated part into the local part: 1, 0.5, 0.25, ...
            // This has to be a floating-point value; as an int, `pnext /= 2` truncated 1 to 0 and the
            // halving never took effect.
            var pnext = 1.0;

            while (partIndex < allParts.Length && rand.NextDouble() <= pnext)
            {
                sb.Append(allParts[partIndex]);
                pnext /= 2;
                partIndex++;
            }

            // Keep only dot-separated pieces that are a single character or longer than three
            // characters and that are not banned words.
            var localParts = sb.ToString()
                             .Split('.', StringSplitOptions.RemoveEmptyEntries)
                             .Where(s => (s.Length == 1 || s.Length > 3) && !BannedWords.Contains(s.ToUpperInvariant()));
            var localPart = string.Join('.', localParts);

            if (string.IsNullOrEmpty(localPart))
            {
                return(string.Empty);
            }
            if (domains.TotalCount > config.MinValuesForCategoricalSampling)
            {
                // if the number of distinct domains is big enough we select one from the extracted list
                // NOTE(review): no '@' is inserted here - presumably the extracted domain values
                // already start with '@'; confirm against the code that fills `domains`.
                return(localPart + domains.GetRandomValue(rand));
            }

            // create domain section from the parts not consumed by the local part
            sb.Clear();
            while (partIndex < allParts.Length)
            {
                sb.Append(allParts[partIndex]);
                partIndex++;
            }
            var domainParts = sb.ToString()
                              .Split('.', StringSplitOptions.RemoveEmptyEntries)
                              .Where(p => p.Length > 3 && !BannedWords.Contains(p.ToUpperInvariant()));

            // Most of the time (NextDouble() > 0.15) use only the longest part (the later part wins
            // ties); otherwise join all remaining parts with dots.
            var domain = rand.NextDouble() > 0.15 ?
                         domainParts.Aggregate(string.Empty, (max, cur) => max.Length > cur.Length ? max : cur) :
                         string.Join('.', domainParts);

            if (string.IsNullOrEmpty(domain) || domain.Length < 4)
            {
                return(string.Empty);
            }
            return(localPart + "@" + domain + tlds.GetRandomValue(rand));
        }