コード例 #1
0
        /// <summary>
        /// Groups items by pairwise similarity. Items are visited in input order; each item that has
        /// not yet been claimed becomes the key of a new group, and every still-unclaimed item for
        /// which <c>IsSimilar</c> returns true against that key is added to the group and claimed.
        /// </summary>
        /// <typeparam name="T">Item type; must implement the text, image and links similarity interfaces.</typeparam>
        /// <param name="t">Items to group. Enumerated exactly once.</param>
        /// <param name="MinSimilarity">
        /// Not read by this method; kept so the signature stays compatible with existing callers.
        /// NOTE(review): presumably the threshold is applied inside <c>IsSimilar</c> - confirm.
        /// </param>
        /// <returns>
        /// One group per key item, in the order the keys were encountered. The key item is exposed
        /// through <c>Key</c> only; it is not added as a member, so an item without matches yields a
        /// group with no members added here.
        /// </returns>
        public static IEnumerable <IGrouping <T, T> > GroupSimilar2 <T>(this IEnumerable <T> t, decimal MinSimilarity = GOOD) where T : ISimilarText, ISimilarImage, ISimilarLinks
        {
            var groups = new List <SimilarObjects <T> >();
            var items  = t.Select(x => new FlaggedObject <T>()
            {
                Object = x
            }).ToList();

            foreach (var item in items)
            {
                // Already claimed as a member of an earlier group.
                if (item.Flagged)
                {
                    continue;
                }

                // Claim the key first so the query below can never select it.
                item.Flagged = true;

                // Materialize the query once: a deferred Where would be re-evaluated on every
                // enumeration, running the similarity check more than once for the same pair.
                var matches = items.Where(x => !x.Flagged && x != item && x.Object.IsSimilar(item.Object)).ToList();

                var so = new SimilarObjects <T>()
                {
                    Key = item.Object
                };

                foreach (var match in matches)
                {
                    match.Flagged = true;
                    so.Add(match.Object);
                }

                groups.Add(so);
            }

            return(groups);
        }
コード例 #2
0
ファイル: GroupingUtility.cs プロジェクト: rickyn/postworthy
        /// <summary>
        /// Groups items whose text or image is similar. Every item that has not been assigned to an
        /// earlier item becomes a parent; each later unassigned item is compared against it and, on a
        /// match, is assigned to that parent. Parents and their children are then collected into groups.
        /// </summary>
        /// <remarks>
        /// Pass 1 compares the <c>WordLetterPairHash</c> sequences of both items:
        /// similarity = 2 * (entries of the parent found in the candidate) / (sum of both lengths),
        /// and matches when that value is at least <paramref name="MinSimilarity"/>.
        /// Pass 2 runs only when pass 1 did not match and both items have an image; it matches when
        /// the first template-matching result has a similarity above 0.90.
        /// </remarks>
        /// <typeparam name="T">Item type; must implement the text and image similarity interfaces.</typeparam>
        /// <param name="t">Items to group. Enumerated exactly once.</param>
        /// <param name="MinSimilarity">Minimum text similarity required for a pass 1 match.</param>
        /// <param name="log">Optional writer that receives progress messages; may be null.</param>
        /// <returns>
        /// One group per parent item, in input order. Each group contains the parent itself followed
        /// by its matched children. An empty input yields an empty sequence.
        /// </returns>
        public static IEnumerable <IGrouping <T, T> > GroupSimilar <T>(this IEnumerable <T> t, decimal MinSimilarity = GOOD, TextWriter log = null) where T : ISimilarText, ISimilarImage
        {
            var input    = t.ToList();
            var groups   = new List <SimilarObjects <T> >();
            int soLength = input.Count;
            SimilarObject <T>[] so = null;

            #region Process Data
            if (soLength > 0)
            {
                if (log != null)
                {
                    log.WriteLine("{0}: [GroupSimilar] Initializing SimilarityObjects", DateTime.Now);
                }

                #region Initialize SimilarityObjects
                so = new SimilarObject <T> [soLength];

                for (int i = 0; i < soLength; i++)
                {
                    so[i] = new SimilarObject <T>()
                    {
                        Object          = input[i],
                        SimilarityIndex = 0
                    };
                }
                #endregion

                if (log != null)
                {
                    log.WriteLine("{0}: [GroupSimilar] Comparing {1} items", DateTime.Now, soLength);
                }

                #region Compare Objects
                for (int i = 0; i < soLength; i++)
                {
                    // An item already assigned to a parent never becomes a parent itself.
                    if (so[i].ParentObject != null)
                    {
                        continue;
                    }

                    for (int j = i + 1; j < soLength; j++)
                    {
                        // Already has a parent, so there is no need to find it another one.
                        if (so[j].ParentObject != null)
                        {
                            continue;
                        }

                        #region Compare All Pairs and Assign Similarity (Pass 1)
                        // Read each property once per pair rather than on every inner-loop iteration.
                        var parentPairs    = so[i].Object.WordLetterPairHash;
                        var candidatePairs = so[j].Object.WordLetterPairHash;
                        int p1c            = parentPairs.Length;
                        int p2c            = candidatePairs.Length;
                        int union          = p1c + p2c;

                        // union == 0 means neither item has any pairs; skip to avoid dividing by zero.
                        if (union != 0)
                        {
                            int intersection = 0;

                            for (int k = 0; k < p1c; k++)
                            {
                                for (int l = 0; l < p2c; l++)
                                {
                                    if (parentPairs[k] == candidatePairs[l])
                                    {
                                        // Count each parent entry at most once.
                                        intersection++;
                                        break;
                                    }
                                }
                            }

                            decimal si = (2.0M * intersection) / union;
                            if (si >= MinSimilarity)
                            {
                                so[j].ParentObject    = so[i].Object;
                                so[j].SimilarityIndex = si;
                                so[j].PassNumber      = 1;

                                // Move on to the next candidate (not out of the loop) so that one
                                // parent can collect every similar item, not just the first.
                                continue;
                            }
                        }
                        #endregion

                        #region Compare Images and Assign Similarity (Pass 2)
                        if (so[i].Object.Image != null && so[j].Object.Image != null)
                        {
                            // NOTE(review): the 0 argument is presumably the similarity threshold,
                            // i.e. "report every match" - confirm against the AForge API.
                            var             etm       = new ExhaustiveTemplateMatching(0);
                            TemplateMatch[] matchings = etm.ProcessImage(so[i].Object.Image, so[j].Object.Image);

                            // Guard the index in case no match is reported at all.
                            if (matchings.Length > 0 && matchings[0].Similarity > 0.90)
                            {
                                so[j].ParentObject    = so[i].Object;
                                so[j].SimilarityIndex = Convert.ToDecimal(matchings[0].Similarity);
                                so[j].PassNumber      = 2;
                            }
                        }
                        #endregion
                    }
                }
                #endregion

                if (log != null)
                {
                    log.WriteLine("{0}: [GroupSimilar] Compared {1} items and found {2} with similar text and {3} with similar images",
                                  DateTime.Now,
                                  soLength,
                                  so.Count(x => x.PassNumber == 1),
                                  so.Count(x => x.PassNumber == 2));
                }
            }
            #endregion

            #region Create Groups
            if (log != null)
            {
                log.WriteLine("{0}: [GroupSimilar] Creating Groups", DateTime.Now);
            }

            if (so != null)
            {
                for (int i = 0; i < soLength; i++)
                {
                    var parent = so[i].ParentObject;

                    if (parent != null)
                    {
                        // Children always come after their parent in input order, so the parent's
                        // group normally exists already; create it defensively if it does not.
                        var existing = groups.FirstOrDefault(g => g.Key.Equals(parent));
                        if (existing == null)
                        {
                            existing = new SimilarObjects <T>()
                            {
                                Key = parent
                            };
                            groups.Add(existing);
                        }

                        existing.Add(so[i].Object);
                    }
                    else
                    {
                        // A parent (or an item with no matches) starts its own group and is its first member.
                        var similarObjects = new SimilarObjects <T>()
                        {
                            Key = so[i].Object
                        };
                        similarObjects.Add(so[i].Object);
                        groups.Add(similarObjects);
                    }
                }
            }
            #endregion

            if (log != null)
            {
                log.WriteLine("{0}: [GroupSimilar] Returning Groups", DateTime.Now);
            }

            return(groups);
        }