예제 #1
0
        /// <summary>
        /// Loads thread profiles for a repository and date range from SQL Server, scores every
        /// pair of thread titles with <c>LevenshteinDistancePercent</c>, and stores each pair
        /// whose score (scaled by 100) is at least 50 in a MongoDB collection. A pair is only
        /// inserted when no document with the same <c>_id</c> (the MD5 hash of the two
        /// lower-cased titles) exists yet.
        /// </summary>
        /// <param name="args">
        /// Four positional values: [0] repository name (also used as the MongoDB database name),
        /// [1] start date and [2] end date, both parsed with <see cref="DateTime.Parse(string)"/>
        /// (current culture), [3] name of the MongoDB collection that receives the results.
        /// </param>
        /// <returns>A task that completes once every title pair has been examined.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="args"/> is null or holds fewer than four values.
        /// </exception>
        public async Task RunAsync(string[] args)
        {
            if (args == null || args.Length < 4)
            {
                throw new ArgumentException(
                    "Expected 4 arguments: <repository> <start> <end> <targetCollection>.", "args");
            }

            // {{ parameters:

            string repository = args[0];

            DateTime start = DateTime.Parse(args[1]);

            DateTime end = DateTime.Parse(args[2]);

            string targetCollection = args[3];

            // }}

            // Pairs scoring below this value are not recorded.
            const decimal minimumPercentage = 50m;

            var distance = new LevenshteinDistance();

            var client = new MongoClient(_mongoconnectionStringProvider.GetConnectionString(repository));

            var database = client.GetDatabase(repository);

            var duplicates = database.GetCollection<BsonDocument>(targetCollection);

            using (var connection = new System.Data.SqlClient.SqlConnection(_mssqlconnectionStringProvider.GetConnectionString()))
            {
                // Normalise each row exactly once: the pair loop below is quadratic, so
                // lower-casing inside it would redo the same work for every comparison.
                // Rows whose title is null (or not a string) are skipped instead of crashing.
                // Invariant casing keeps the hash used as _id independent of the machine locale.
                var threads = connection.Query(SqlQueryFactory.Instance.Get(QUERY_GET_THREAD_PROFILE),
                    new
                    {
                        repository = repository.ToUpperInvariant(),
                        start = start,
                        end = end
                    })
                    .Select(m => new { Id = m.Id as string, Title = m.Title as string })
                    .Where(t => t.Title != null)
                    .Select(t => new { Id = t.Id, Title = t.Title.ToLowerInvariant() })
                    .ToList();

                for (int i = 0; i < threads.Count - 1; i++)
                {
                    var left = threads[i];

                    for (int j = i + 1; j < threads.Count; j++)
                    {
                        var right = threads[j];

                        var percentage = distance.LevenshteinDistancePercent(left.Title, right.Title) * 100;

                        if (percentage < minimumPercentage)
                        {
                            continue;
                        }

                        var md5 = Utils.ComputeStringPairMD5Hash(left.Title, right.Title);

                        // Skip pairs that were already stored under the same hash.
                        var count = await duplicates.Find("{_id: '" + md5 + "'}").CountAsync();

                        if (count != 0)
                        {
                            continue;
                        }

                        var dict = new Dictionary<string, object>()
                        {
                            { "_id", md5 },
                            { "left", new Dictionary<string, string> {
                                { "thread_id", left.Id },
                                { "text", left.Title }
                            }},
                            { "right", new Dictionary<string, string> {
                                { "thread_id", right.Id },
                                { "text", right.Title }
                            }},
                            { "percentage", (int)percentage }
                        };

                        await duplicates.InsertOneAsync(new BsonDocument(dict));
                    }
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Loads thread profiles for the "uwp" repository between 2015-09-01 and 2015-10-01 from
        /// SQL Server, scores every pair of thread titles with <c>LevenshteinDistancePercent</c>,
        /// and stores each pair whose score (scaled by 100) is at least 50 in the
        /// <c>dup_detection</c> MongoDB collection. A pair is only inserted when no document with
        /// the same <c>_id</c> (the MD5 hash of the two lower-cased titles) exists yet.
        /// Writes one "." to the console after each outer-loop thread has been processed.
        /// </summary>
        /// <returns>A task that completes once every title pair has been examined.</returns>
        /// <exception cref="InvalidOperationException">
        /// The "DefaultConnection" connection string is not present in the configuration.
        /// </exception>
        public async Task RunAsync()
        {
            // {{ parameters:

            string repository = "uwp";

            // Built from components rather than parsed, so the values do not depend on the
            // current culture's date format or calendar.
            DateTime start = new DateTime(2015, 9, 1);

            DateTime end = new DateTime(2015, 10, 1);

            // }}

            // Pairs scoring below this value are not recorded.
            const decimal minimumPercentage = 50m;

            var distance = new LevenshteinDistance();

            var connectionSettings = ConfigurationManager.ConnectionStrings["DefaultConnection"];

            if (connectionSettings == null)
            {
                throw new InvalidOperationException("Connection string 'DefaultConnection' is not configured.");
            }

            string cs = connectionSettings.ConnectionString;

            IConnectionStringProvider mongoDBDataProvider =
                ConnectionStringProvider.CreateConnectionStringProvider(ConnectionStringProvider.ConnectionStringProviderType.MongoDBConnectionStringProvider);

            var client = new MongoClient(mongoDBDataProvider.GetConnectionString(repository));

            var database = client.GetDatabase(repository);

            var duplicates = database.GetCollection<BsonDocument>("dup_detection");

            using (var connection = new System.Data.SqlClient.SqlConnection(cs))
            {
                // Normalise each row exactly once: the pair loop below is quadratic, so
                // lower-casing inside it would redo the same work for every comparison.
                // Rows whose title is null (or not a string) are skipped instead of crashing.
                // Invariant casing keeps the hash used as _id independent of the machine locale.
                var threads = connection.Query(SqlQueryFactory.Instance.Get("get_thread_profile"),
                    new
                    {
                        repository = repository.ToUpperInvariant(),
                        start = start,
                        end = end
                    })
                    .Select(m => new { Id = m.Id as string, Title = m.Title as string })
                    .Where(t => t.Title != null)
                    .Select(t => new { Id = t.Id, Title = t.Title.ToLowerInvariant() })
                    .ToList();

                for (int i = 0; i < threads.Count - 1; i++)
                {
                    var left = threads[i];

                    for (int j = i + 1; j < threads.Count; j++)
                    {
                        var right = threads[j];

                        var percentage = distance.LevenshteinDistancePercent(left.Title, right.Title) * 100;

                        if (percentage < minimumPercentage)
                        {
                            continue;
                        }

                        var md5 = Utils.ComputeStringPairMD5Hash(left.Title, right.Title);

                        // Skip pairs that were already stored under the same hash.
                        var count = await duplicates.Find("{_id: '" + md5 + "'}").CountAsync();

                        if (count != 0)
                        {
                            continue;
                        }

                        var dict = new Dictionary<string, object>()
                        {
                            { "_id", md5 },
                            { "left", new Dictionary<string, string> {
                                { "thread_id", left.Id },
                                { "text", left.Title }
                            }},
                            { "right", new Dictionary<string, string> {
                                { "thread_id", right.Id },
                                { "text", right.Title }
                            }},
                            { "percentage", (int)percentage }
                        };

                        await duplicates.InsertOneAsync(new BsonDocument(dict));
                    }

                    // Progress indicator: one dot per outer-loop thread.
                    Console.Write(".");
                }
            }
        }