/// <summary>
/// Clusters up to 20 published articles into <paramref name="partitionCount"/> groups by running
/// k-means over binary tag-presence vectors, then renders the "ArticlePartitions" view.
/// </summary>
/// <param name="partitionCount">
/// Number of clusters to request. Must be at least 1 and no larger than the number of articles loaded.
/// </param>
/// <returns>The "ArticlePartitions" view bound to an <c>ArticlePartitionModel</c>.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="partitionCount"/> is less than 1.</exception>
/// <exception cref="ArgumentException">Fewer articles were loaded than clusters were requested.</exception>
public IActionResult Partition2(int partitionCount)
{
    // Fail fast: rejecting a bad cluster count here avoids the article query and the
    // per-tag queries below, only to have the clustering step reject it afterwards.
    if (partitionCount < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(partitionCount), partitionCount, "At least one partition is required.");
    }

    using (var db = new FusekiContext())
    {
        var articles = db.Articles.Where(el => el.Published == true)
                         .Include(el => el.Tags)
                         .Take(20)
                         .ToList();

        // k-means needs at least as many observations as clusters.
        if (articles.Count < partitionCount)
        {
            throw new ArgumentException(
                $"Cannot split {articles.Count} article(s) into {partitionCount} partitions.",
                nameof(partitionCount));
        }

        // Collect every tag name that appears on the loaded articles.
        var candidateTags = new HashSet<string>();
        foreach (var article in articles)
        {
            foreach (var tag in article.Tags)
            {
                candidateTags.Add(tag.Name);
            }
        }

        // TODO: What happens if we remove all tags which only occur once.
        // Keep only tags carried by more than one article. The count is taken over the whole
        // Articles table (no Published filter, not limited to the 20 loaded), one query per tag.
        var sharedTags = new List<string>();
        foreach (var t in candidateTags)
        {
            var relatedArticles = db.Articles.Where(el => el.Tags.Select(tag => tag.Name).Contains(t));
            if (relatedArticles.Count() > 1)
            {
                sharedTags.Add(t);
            }
        }

        // Sort once so that every article vector uses the same column order.
        var allTagsOrdered = sharedTags.OrderBy(el => el).ToList();

        // One row per article, one column per tag: 1 when the article carries the tag, else 0.
        var vecvec = new double[articles.Count][];
        for (var row = 0; row < articles.Count; row++)
        {
            // A set gives constant-time membership tests instead of re-scanning the tag list per column.
            var articleTags = new HashSet<string>(articles[row].Tags.Select(el => el.Name));
            vecvec[row] = allTagsOrdered.Select(tag => articleTags.Contains(tag) ? 1.0 : 0.0).ToArray();
        }

        var kmeans = new KMeans(k: partitionCount);
        var clusters = kmeans.Learn(vecvec);

        // Diagnostics handed to the view alongside the partitions.
        var dict = new Dictionary<string, object>();
        dict["Kmeans Error"] = kmeans.Error;
        dict["dimensionality"] = kmeans.Dimension;
        dict["Iterations"] = kmeans.Iterations;
        dict["MaxIterations"] = kmeans.MaxIterations;
        dict["Tolerance"] = kmeans.Tolerance;

        // labels[i] is the partition number assigned to articles[i].
        int[] labels = clusters.Decide(vecvec);

        // This is totally fake. TODO: refactor these to be dumber - no need to have comparators etc.
        var dm = new DistanceMetrics<Article>(
            (a, b) => Comparators.GetTagCommonality(a, b),
            (a, b) => Comparators.ArticleKeyLookup(a, b));

        // TODO: is accord zero indexed? The labels are used directly as indexes into psets,
        // so this code assumes cluster ids run from 0 to partitionCount - 1.
        var psets = new List<PartitionSet<Article>>();
        for (var ii = 0; ii < partitionCount; ii++)
        {
            psets.Add(new PartitionSet<Article>(dm, ii));
        }

        for (var index = 0; index < labels.Length; index++)
        {
            psets[labels[index]].Add(articles[index]);
        }

        var partitiondata = new PartitionData<Article>(psets, dict);
        var model = new ArticlePartitionModel(partitiondata);
        return View("ArticlePartitions", model);
    }
}
/// <summary>
/// Splits <paramref name="Elements"/> into <paramref name="partitionCount"/> sets: first a greedy
/// pass that adds each element to the set chosen by <c>FindBestPartition</c>, then a refinement
/// loop that moves one element per pass until no element wants to move or the pass limit is hit.
/// </summary>
/// <remarks>
/// Design notes:
/// It is obviously impossible to test every possibility.
/// Might be possible to iterate or do local hill climbing? Order matters for all that.
/// Title length has weird transitivity; don't use that because it won't apply for other metrics.
/// I could just do it 100 times with random starts, iterating adding members to the set which
/// they are closest to. Then as a final step test each element to see if it belongs better in
/// another set.
/// </remarks>
/// <param name="partitionCount">Number of sets to build; must be between 2 and 100 inclusive.</param>
/// <param name="Elements">Items to distribute across the sets.</param>
/// <returns>The final sets together with a dictionary of quality/iteration statistics.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="partitionCount"/> is outside 2..100.</exception>
/// <exception cref="ArgumentNullException"><paramref name="Elements"/> is null.</exception>
public PartitionData<T> GetPartitions(int partitionCount, List<T> Elements)
{
    // Upper bound on refinement passes so a pair of elements that keep swapping cannot loop forever.
    const int MaxRefinementPasses = 200;

    if (partitionCount < 2 || partitionCount > 100)
    {
        throw new ArgumentOutOfRangeException(
            nameof(partitionCount),
            partitionCount,
            "Are you sure you want to generate that many partitions?");
    }

    if (Elements == null)
    {
        throw new ArgumentNullException(nameof(Elements));
    }

    var sets = new List<PartitionSet<T>>();
    for (var ii = 0; ii < partitionCount; ii++)
    {
        sets.Add(new PartitionSet<T>(metrics, ii));
    }

    // Initial greedy assignment: each element joins whichever set currently suits it best,
    // so the outcome depends on the order of Elements.
    foreach (var el in Elements)
    {
        var targetSet = FindBestPartition(el, sets);
        targetSet.Add(el);
    }

    var stats = new Dictionary<string, object>();
    stats["InitialQuality"] = FindQuality(sets);

    // Refinement: relocate at most one element per pass until a pass finds nothing to move
    // or the pass limit is reached.
    // Problem: I am moving to favor the article, not to favor the overall quality of matches.
    // i.e. if there is a linking article who is happier in a dedicated node, but removing him
    // hurts the parent, how to do it?
    var loopCt = 0;
    var moveCt = 0;
    while (loopCt < MaxRefinementPasses)
    {
        var move = FindFirstPlannedMove(sets);
        moveCt = move == null ? 0 : 1;

        // The move is applied only after the search has finished enumerating the sets,
        // so no collection is modified while it is being iterated.
        if (move != null)
        {
            move.Item2.Remove(move.Item1);
            move.Item3.Add(move.Item1);
        }

        loopCt++;
        stats[$"quality:{loopCt} moved:{moveCt}"] = FindQuality(sets);

        if (moveCt == 0)
        {
            break;
        }
    }

    stats["moveCt"] = moveCt;
    stats["loopCt"] = loopCt;
    stats["Final quality"] = FindQuality(sets);

    return new PartitionData<T>(sets, stats);
}

/// <summary>
/// Scans the sets in order and returns the first element whose preferred set (per
/// <c>FindBestPartition</c>) differs from the set it currently sits in.
/// </summary>
/// <returns>
/// A tuple of (element, current set, preferred set), or <c>null</c> when every element is
/// already in its preferred set.
/// </returns>
private Tuple<T, PartitionSet<T>, PartitionSet<T>> FindFirstPlannedMove(List<PartitionSet<T>> sets)
{
    foreach (var set in sets)
    {
        foreach (var el in set.Items)
        {
            // NOTE: the element is still a member of its current set while being evaluated.
            var targetSet = FindBestPartition(el, sets);
            if (targetSet != set)
            {
                return new Tuple<T, PartitionSet<T>, PartitionSet<T>>(el, set, targetSet);
            }
        }
    }

    return null;
}