/// <summary>
/// Creates a <c>SimpleKMeans</c> clusterer configured for the requested number of
/// clusters and builds it on the supplied instances.
/// </summary>
/// <param name="numberOfClusters">Cluster count passed to <c>setNumClusters</c>.</param>
/// <param name="instances">Data set handed to <c>buildClusterer</c>.</param>
/// <returns>The clusterer after <c>buildClusterer</c> has run.</returns>
private static SimpleKMeans PrepareKMeansForArgument(int numberOfClusters, Instances instances)
{
    SimpleKMeans clusterer = new SimpleKMeans();

    clusterer.setNumClusters(numberOfClusters);
    clusterer.buildClusterer(instances);

    return clusterer;
}
/// <summary>
/// Widens the per-cluster ranges of argument <paramref name="i"/> so that each range
/// covers the attribute value of every instance the clusterer assigns to that cluster.
/// </summary>
/// <param name="instances">Instances to scan.</param>
/// <param name="kMeans">Clusterer used to assign each instance to a cluster index.</param>
/// <param name="i">Index of the argument; selects both the attribute in <c>_attributes</c> and the entry in the range list.</param>
/// <param name="argumentsClustersRangeList">Range holders whose <c>From</c>/<c>To</c> bounds are updated in place.</param>
private static void UpdateClustersRangeListForArgument(Instances instances, SimpleKMeans kMeans, int i, IReadOnlyList<ArgumentClustersRanges> argumentsClustersRangeList)
{
    for (var index = 0; index < instances.numInstances(); index++)
    {
        var current = instances.instance(index);
        var cluster = kMeans.clusterInstance(current);
        var observed = current.value(_attributes[i]);

        // Looked up inside the loop on purpose: with no instances nothing is indexed at all.
        var ranges = argumentsClustersRangeList[i].ClusterRanges;

        // Lower bound: pull From down to the smallest value seen in this cluster.
        if (observed < ranges[cluster].From)
        {
            ranges[cluster].From = observed;
        }

        // Upper bound: push To up to the largest value seen in this cluster.
        if (observed > ranges[cluster].To)
        {
            ranges[cluster].To = observed;
        }
    }
}
/// <summary>
/// Entry point. Loads restaurant businesses, their reviews and the reviewers' profiles from
/// line-delimited JSON files, clusters the review texts of every restaurant with Weka, records
/// reviewers whose reviews share a cluster as "friends", and finally writes one XML file per
/// user plus a user index file.
/// </summary>
/// <param name="arg">Command-line arguments (not used).</param>
static void Main(string[] arg)
{
    Dictionary<string, Business> businesses = LoadRestaurants(@"F:\Data Science Project\businesses.txt");
    Dictionary<string, User> users = LoadReviews(@"F:\Data Science Project\reviews.txt", businesses);
    LoadUserProfiles(@"F:\Data Science Project\users.txt", users);

    foreach (Business business in businesses.Values)
    {
        LinkReviewersOfSimilarReviews(business, users);
    }

    XmlDocument xmlDoc = BuildUsersDocument(users, businesses);
    WriteUserFiles(xmlDoc, @"F:\Data Science Project\users\");
}

/// <summary>
/// Reads the business file and returns every record of type "business" whose categories
/// mention "Restaurants", keyed by business id.
/// </summary>
private static Dictionary<string, Business> LoadRestaurants(string path)
{
    Dictionary<string, Business> businesses = new Dictionary<string, Business>();

    using (StreamReader reader = new StreamReader(path))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (!line.Contains("\"type\": \"business\""))
            {
                continue;
            }

            JObject obj = JObject.Parse(line);
            string categories = obj["categories"].ToString();
            if (!categories.Contains("Restaurants"))
            {
                continue;
            }

            Business business = new Business();
            business.name = obj["name"].ToString();
            business.full_address = obj["full_address"].ToString();
            business.city = obj["city"].ToString();
            business.state = obj["state"].ToString();
            // Typed JSON conversions avoid a culture-dependent number -> string -> number round trip.
            business.latitude = (double)obj["latitude"];
            business.longitude = (double)obj["longitude"];
            business.stars = (float)obj["stars"];
            business.review_count = (int)obj["review_count"];
            business.categories = categories;
            businesses.Add(obj["business_id"].ToString(), business);
        }
    }

    return businesses;
}

/// <summary>
/// Reads the review file, attaches every review of a known restaurant to that restaurant and
/// to its author, and returns the authors keyed by user id (created on first sight).
/// </summary>
private static Dictionary<string, User> LoadReviews(string path, Dictionary<string, Business> businesses)
{
    Dictionary<string, User> users = new Dictionary<string, User>();

    using (StreamReader reader = new StreamReader(path))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (!line.Contains("\"type\": \"review\""))
            {
                continue;
            }

            JObject obj = JObject.Parse(line);
            string businessId = obj["business_id"].ToString();

            // Only restaurants are ever stored in the dictionary, so a hit needs no category re-check.
            Business business;
            if (!businesses.TryGetValue(businessId, out business))
            {
                continue;
            }

            string userId = obj["user_id"].ToString();

            Review review = new Review();
            review.text = RemoveSpecialCharacters(obj["text"].ToString());
            review.business_id = businessId;
            review.stars = (float)obj["stars"];
            review.user_id = userId;

            User user;
            if (!users.TryGetValue(userId, out user))
            {
                user = new User();
                users.Add(userId, user);
            }

            user.reviews.Add(review);
            business.reviews.Add(review);
        }
    }

    return users;
}

/// <summary>
/// Reads the user file and fills in the profile fields of every user that already exists in
/// <paramref name="users"/>; records for other users are ignored.
/// </summary>
private static void LoadUserProfiles(string path, Dictionary<string, User> users)
{
    using (StreamReader reader = new StreamReader(path))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (!line.Contains("\"type\": \"user\""))
            {
                continue;
            }

            JObject obj = JObject.Parse(line);
            string userId = obj["user_id"].ToString();

            User user;
            if (!users.TryGetValue(userId, out user))
            {
                continue;
            }

            user.name = obj["name"].ToString();
            user.user_id = userId;
            user.average_stars = (float)obj["average_stars"];
            user.review_count = (int)obj["review_count"];
            user.funny_votes = (int)obj["votes"]["funny"];
            user.useful_votes = (int)obj["votes"]["useful"];
            user.cool_votes = (int)obj["votes"]["cool"];
        }
    }
}

/// <summary>
/// Clusters the review texts of one business with SimpleKMeans (one cluster per five reviews,
/// rounded up) and adds, for every pair of distinct reviews in the same cluster, the second
/// review's author to the friends of the first review's author.
/// </summary>
private static void LinkReviewersOfSimilarReviews(Business business, Dictionary<string, User> users)
{
    var reviews = business.reviews;

    // No reviews means a cluster count of 0, which SimpleKMeans rejects; there is nothing to link anyway.
    if (reviews.Count == 0)
    {
        return;
    }

    // Build an in-memory ARFF data set with one row per review.
    StringBuilder sb = new StringBuilder();
    sb.AppendLine("@relation reviews");
    sb.AppendLine("@attribute user_id string");
    sb.AppendLine("@attribute text string");
    sb.AppendLine("@attribute stars numeric");
    sb.AppendLine("@data");
    foreach (Review review in reviews)
    {
        // Invariant culture: a decimal comma would be read by the ARFF parser as a field separator.
        sb.AppendLine(review.user_id + ",\"" + review.text + "\","
            + review.stars.ToString(System.Globalization.CultureInfo.InvariantCulture));
    }

    Instances instances = new Instances(new java.io.StringReader(sb.ToString()));

    // Convert the user_id attribute from string to nominal.
    StringToNominal nominalFilter = new StringToNominal();
    string[] options = weka.core.Utils.splitOptions("-R first");
    nominalFilter.setOptions(options);
    nominalFilter.setInputFormat(instances);
    instances = weka.filters.Filter.useFilter(instances, nominalFilter);

    // Convert the text attribute into word-vector attributes.
    StringToWordVector stwFilter = new StringToWordVector();
    options = weka.core.Utils.splitOptions("-R first-last -P att_ -W 1000 -prune-rate -1.0 -T -I -N 0 -L -stemmer weka.core.stemmers.NullStemmer -M 2 -tokenizer weka.core.tokenizers.WordTokenizer -delimiters \\r\\n\\t.,;:\\\'\\\"()?!\"\"");
    stwFilter.setOptions(options);
    stwFilter.setInputFormat(instances);
    instances = weka.filters.Filter.useFilter(instances, stwFilter);

    // Ceiling of reviews.Count / 5 in integer arithmetic.
    int clusterCount = (reviews.Count + 4) / 5;

    SimpleKMeans kmeansClusterer = new SimpleKMeans();
    options = weka.core.Utils.splitOptions("-N " + clusterCount.ToString(System.Globalization.CultureInfo.InvariantCulture) + " -A \"weka.core.EuclideanDistance -R first-last\" -I 5 -S 10");
    kmeansClusterer.setOptions(options);
    // Required so that getAssignments() reports clusters in the order of the input rows.
    kmeansClusterer.setPreserveInstancesOrder(true);
    kmeansClusterer.buildClusterer(instances);

    int[] assignments = kmeansClusterer.getAssignments();
    for (int j = 0; j < assignments.Length; j++)
    {
        var friends = users[reviews[j].user_id].friends;
        for (int k = 0; k < assignments.Length; k++)
        {
            // NOTE(review): the check compares review indices, not user ids, so a user with two
            // reviews in the same cluster is added as their own friend - confirm this is intended.
            if (j != k && assignments[j] == assignments[k])
            {
                friends.Add(reviews[k].user_id);
            }
        }
    }
}

/// <summary>
/// Builds the in-memory XML document: a "users" root holding one "user" element (id, name,
/// friends, reviews with text and business coordinates) per user that has at least one friend.
/// Users without a name are given the name "No Name".
/// </summary>
private static XmlDocument BuildUsersDocument(Dictionary<string, User> users, Dictionary<string, Business> businesses)
{
    XmlDocument xmlDoc = new XmlDocument();
    XmlElement usersElt = xmlDoc.CreateElement("users");
    xmlDoc.AppendChild(usersElt);

    foreach (KeyValuePair<string, User> entry in users)
    {
        User user = entry.Value;
        if (user.friends.Count == 0)
        {
            continue;
        }

        if (string.IsNullOrEmpty(user.name))
        {
            user.name = "No Name";
        }

        XmlElement userElt = xmlDoc.CreateElement("user");

        XmlAttribute idAtt = xmlDoc.CreateAttribute("id");
        idAtt.Value = user.user_id;
        userElt.Attributes.Append(idAtt);

        AppendTextElement(xmlDoc, userElt, "name", user.name);

        XmlElement friendsElt = xmlDoc.CreateElement("friends");
        foreach (string friendId in user.friends)
        {
            AppendTextElement(xmlDoc, friendsElt, "friend", friendId);
        }
        userElt.AppendChild(friendsElt);

        XmlElement reviewsElt = xmlDoc.CreateElement("reviews");
        foreach (Review review in user.reviews)
        {
            Business business = businesses[review.business_id];
            XmlElement reviewElt = xmlDoc.CreateElement("review");
            AppendTextElement(xmlDoc, reviewElt, "text", review.text);
            // Invariant culture keeps the coordinates machine-readable regardless of the host locale.
            AppendTextElement(xmlDoc, reviewElt, "lat", business.latitude.ToString(System.Globalization.CultureInfo.InvariantCulture));
            AppendTextElement(xmlDoc, reviewElt, "long", business.longitude.ToString(System.Globalization.CultureInfo.InvariantCulture));
            reviewsElt.AppendChild(reviewElt);
        }
        userElt.AppendChild(reviewsElt);

        usersElt.AppendChild(userElt);
    }

    return xmlDoc;
}

/// <summary>
/// Creates an element named <paramref name="name"/> with the given text content and appends it
/// to <paramref name="parent"/>.
/// </summary>
private static void AppendTextElement(XmlDocument xmlDoc, XmlElement parent, string name, string text)
{
    XmlElement element = xmlDoc.CreateElement(name);
    element.InnerText = text;
    parent.AppendChild(element);
}

/// <summary>
/// Writes every "user" element that has a non-empty id to "&lt;directory&gt;&lt;id&gt;.txt" and
/// an index of ids and names to "&lt;directory&gt;userlist.txt".
/// </summary>
private static void WriteUserFiles(XmlDocument xmlDoc, string directory)
{
    using (StreamWriter userlistwriter = new StreamWriter(directory + "userlist.txt"))
    {
        userlistwriter.Write("<users>");

        foreach (XmlNode user in xmlDoc.DocumentElement.SelectNodes("//user"))
        {
            XmlAttribute idAttribute = user.Attributes["id"];
            string id = idAttribute.InnerText;
            if (string.IsNullOrEmpty(id))
            {
                continue;
            }

            using (StreamWriter writer = new StreamWriter(directory + id + ".txt"))
            {
                writer.Write(user.OuterXml);
            }

            // OuterXml is already escaped; concatenating InnerText would emit raw '&' and '<'
            // from user names and corrupt the index.
            userlistwriter.Write("<user " + idAttribute.OuterXml + ">" + user.SelectSingleNode("name").OuterXml + "</user>");
        }

        userlistwriter.Write("</users>");
    }
}
/// <summary>
/// Creates the WEKA clusterer selected by <paramref name="ClusteringAlgo"/>, configures it from
/// the algorithm's parameter values, builds it on the instances derived from
/// <paramref name="Input"/> and stores the resulting cluster count in <c>NumberOfClusters</c>.
/// Also stores <paramref name="Input"/> in <c>InputTable</c> and appends the name of each of its
/// items to <c>ListDescriptors</c>.
/// </summary>
/// <param name="ClusteringAlgo">Algorithm description; its <c>Name</c> selects the clusterer
/// ("EM", "K-Means", "Hierarchical", "FarthestFirst", "CobWeb" or "Manual").</param>
/// <param name="Input">Table the instances are created from.</param>
/// <returns>
/// The built clusterer, or null when the algorithm is "Manual", when the selected distance is not
/// supported by the chosen algorithm, or when the algorithm name is unknown (an error box is
/// shown in that last case).
/// </returns>
public Clusterer BuildClusterer(cParamAlgo ClusteringAlgo, cExtendedTable Input)
{
    this.InputTable = Input;
    // NOTE(review): names are appended on every call; confirm ListDescriptors is reset elsewhere.
    foreach (var item in Input)
    {
        this.ListDescriptors.Add(item.Name);
    }

    cListValuesParam Parameters = ClusteringAlgo.GetListValuesParam();
    Clusterer ClustererToReturn;
    Instances ListInstancesWithoutClasses = CreateInstancesWithoutClass(Input);

    if (ClusteringAlgo.Name == "EM")
    {
        EM EMClusterer = new EM();
        if (Parameters.ListCheckValues.Get("checkBoxAutomatedClassNum").Value)
        {
            // -1 is the value passed to EM when the cluster count is to be chosen automatically.
            EMClusterer.setNumClusters(-1);
        }
        else
        {
            EMClusterer.setNumClusters((int)Parameters.ListDoubleValues.Get("numericUpDownNumClasses").Value);
        }
        EMClusterer.setMaxIterations((int)Parameters.ListDoubleValues.Get("numericUpDownMaxIterations").Value);
        EMClusterer.setMinStdDev((double)Parameters.ListDoubleValues.Get("numericUpDownMinStdev").Value);
        EMClusterer.setSeed((int)Parameters.ListDoubleValues.Get("numericUpDownSeedNumber").Value);
        ClustererToReturn = EMClusterer;
    }
    else if (ClusteringAlgo.Name == "K-Means")
    {
        SimpleKMeans KMeans = new SimpleKMeans();
        KMeans.setNumClusters((int)Parameters.ListDoubleValues.Get("numericUpDownNumClasses").Value);
        KMeans.setSeed((int)Parameters.ListDoubleValues.Get("numericUpDownSeedNumber").Value);

        string DistanceType = (string)Parameters.ListTextValues.Get("comboBoxDistance").Value;
        if (DistanceType == "Euclidean")
        {
            EuclideanDistance ED = new EuclideanDistance();
            ED.setDontNormalize(!(bool)Parameters.ListCheckValues.Get("checkBoxNormalize").Value);
            KMeans.setDistanceFunction(ED);
        }
        else if (DistanceType == "Manhattan")
        {
            ManhattanDistance MD = new ManhattanDistance();
            MD.setDontNormalize(!(bool)Parameters.ListCheckValues.Get("checkBoxNormalize").Value);
            KMeans.setDistanceFunction(MD);
        }
        else
        {
            // Only Euclidean and Manhattan distances are wired up for K-Means.
            return null;
        }
        ClustererToReturn = KMeans;
    }
    else if (ClusteringAlgo.Name == "Hierarchical")
    {
        HierarchicalClusterer Hierarchical = new HierarchicalClusterer();

        string OptionDistance = " -N " + (int)Parameters.ListDoubleValues.Get("numericUpDownNumClasses").Value;

        string DistanceType = (string)Parameters.ListTextValues.Get("comboBoxDistance").Value;
        string DistanceClass;
        switch (DistanceType)
        {
            case "Euclidean":
                DistanceClass = "EuclideanDistance";
                break;
            case "Manhattan":
                DistanceClass = "ManhattanDistance";
                break;
            case "Chebyshev":
                DistanceClass = "ChebyshevDistance";
                break;
            default:
                // An unknown distance used to produce the option -A "weka.core. ...", which WEKA
                // cannot resolve to a class; bail out the same way the K-Means branch does.
                return null;
        }

        // Resulting option: -A "weka.core.<DistanceClass> [-D] -R first-last"
        OptionDistance += " -A \"weka.core." + DistanceClass;
        if (!(bool)Parameters.ListCheckValues.Get("checkBoxNormalize").Value)
        {
            OptionDistance += " -D";
        }
        OptionDistance += " -R first-last\"";

        string WekaOption = "-L " + (string)Parameters.ListTextValues.Get("comboBoxLinkType").Value + OptionDistance;
        Hierarchical.setOptions(weka.core.Utils.splitOptions(WekaOption));
        ClustererToReturn = Hierarchical;
    }
    else if (ClusteringAlgo.Name == "FarthestFirst")
    {
        FarthestFirst Farthest = new FarthestFirst();
        Farthest.setNumClusters((int)Parameters.ListDoubleValues.Get("numericUpDownNumClasses").Value);
        Farthest.setSeed((int)Parameters.ListDoubleValues.Get("numericUpDownSeedNumber").Value);
        ClustererToReturn = Farthest;
    }
    else if (ClusteringAlgo.Name == "CobWeb")
    {
        Cobweb CobwebClusterer = new Cobweb();
        CobwebClusterer.setSeed((int)Parameters.ListDoubleValues.Get("numericUpDownSeedNumber").Value);
        CobwebClusterer.setAcuity((double)Parameters.ListDoubleValues.Get("numericUpDownAcuity").Value);
        CobwebClusterer.setCutoff((double)Parameters.ListDoubleValues.Get("numericUpDownCutOff").Value);
        ClustererToReturn = CobwebClusterer;
    }
    else if (ClusteringAlgo.Name == "Manual")
    {
        // The selected descriptor is read but not used yet: descriptor-driven class assignment
        // is not implemented.
        string DescriptorName = (string)Parameters.ListTextValues.Get("comboBoxForDescriptorManualClustering").Value;

        // Placeholder split: the first half of the existing class entries is set to 2.
        for (int IdxPt = 0; IdxPt < this.Classes.Count / 2; IdxPt++)
        {
            this.Classes[IdxPt] = 2;
        }
        this.NumberOfClusters = 2;

        // No WEKA clusterer is involved in manual mode.
        return null;
    }
    else
    {
        System.Windows.Forms.MessageBox.Show("Clustering method not implemented !", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
        return null;
    }

    // Common tail for every WEKA-backed algorithm: train, then publish the cluster count.
    ClustererToReturn.buildClusterer(ListInstancesWithoutClasses);
    this.NumberOfClusters = ClustererToReturn.numberOfClusters();
    return ClustererToReturn;
}