/// <summary>
/// Ends the training session: prints "End", converts <c>oDataSet</c> into a
/// word-vector dataset (stored in <c>dataSet</c>) and builds a RAkEL multi-label
/// learner over BinaryRelevance/NaiveBayes (stored in <c>lps</c>).
/// </summary>
public void EndTrainingSession()
{
    Console.WriteLine("End");

    // Configure every option first: Weka requires setInputFormat to be the last
    // call on a filter before it is applied, so it must follow the stemmer and
    // IDF settings rather than precede them.
    stv = new StringToWordVector();
    stv.setAttributeNamePrefix("#");
    stv.setLowerCaseTokens(true);
    stv.setOutputWordCounts(true);
    stv.setStemmer(new weka.core.stemmers.LovinsStemmer());
    stv.setIDFTransform(true);
    stv.setInputFormat(oDataSet);

    dataSet = Filter.useFilter(oDataSet, stv);

    // Label metadata comes from loadLabelsMeta, driven by the tagsNb field.
    MultiLabelInstances mli = new MultiLabelInstances(dataSet, loadLabelsMeta(dataSet, tagsNb));

    // Enable debug on the base learner before it is handed to RAkEL so the flag
    // is in place regardless of how RAkEL holds on to the learner.
    BinaryRelevance br = new mulan.classifier.transformation.BinaryRelevance(new NaiveBayes());
    br.setDebug(true);

    lps = new mulan.classifier.meta.RAkEL(br);
    lps.setDebug(true);
    lps.build(mli);
}
/// <summary>
/// Trains a NaiveBayes classifier (wrapped in a FilteredClassifier) on the data
/// returned by <c>ImportaArquivosServidor</c>, records the training timestamp in
/// <c>TempData</c>, and persists both the model and the training data under
/// <c>DiretorioClassificadorServidor</c>.
/// </summary>
private void RealizaTreinamentoWeka()
{
    TempData["DataTreinamento"] = DateTime.Now;
    Instances trainingData = ImportaArquivosServidor();

    // Filter chain: StringToWordVector followed by the attribute selection filter.
    weka.filters.Filter[] chain = new weka.filters.Filter[]
    {
        new StringToWordVector(),
        AttributeSelectionFilter(2)
    };

    // NOTE(review): the input format is set before the filter chain is installed;
    // FilteredClassifier presumably re-initialises the filter when building — confirm.
    weka.filters.MultiFilter pipeline = new weka.filters.MultiFilter();
    pipeline.setInputFormat(trainingData);
    pipeline.setFilters(chain);

    // Build the classifier from the training data.
    FilteredClassifier trainedModel = new FilteredClassifier();
    trainedModel.setFilter(pipeline);
    trainedModel.setClassifier(new NaiveBayes());
    trainedModel.buildClassifier(trainingData);

    // Cross-validation used to expose the classifier statistics (currently disabled):
    /*
     * Evaluation eval = new Evaluation(dadosTreinamento);
     * eval.crossValidateModel(classifier, dadosTreinamento, 10, new java.util.Random(1));
     *
     * var EstatisticasTexto =
     *     "Instâncias Corretas: \t" + eval.correct() + " (" + Math.Round(eval.pctCorrect(),2) + "%)" + System.Environment.NewLine +
     *     "Instâncias Incorretas: \t" + eval.incorrect() + " (" + Math.Round(eval.pctIncorrect(),2) + "%)" + System.Environment.NewLine +
     *     "Total de Instâncias: \t\t" + eval.numInstances();
     *
     * TempData["TreinamentoRealizado"] = EstatisticasTexto;
     */

    // Save the classifier (model) as /Classificador/Classificador.model
    string modelPath = string.Format("{0}Classificador.model", DiretorioClassificadorServidor);
    SerializationHelper.write(modelPath, trainedModel);

    // Save the training data as /Classificador/DadosTreinamento.arff
    Utilidades.SalvarArff(trainingData, DiretorioClassificadorServidor, "DadosTreinamento.arff");
}
/// <summary>
/// Classifies every instance of the test dataset with a filtered SVM (Weka SMO with a
/// polynomial kernel behind a StringToWordVector filter). A cached model is reused when
/// the matching "CheckifModelExists" helper reports one for the training file; otherwise
/// the model is trained, cached, and a copy of the training file is stored next to it.
/// Results are appended to <c>tempAllClassification</c> and published via
/// <c>AllClassification</c>; any exception is captured as text in <c>error</c>.
/// </summary>
/// <param name="type">"BOF" selects "BOF Dataset.arff"; any other value selects
/// "BOW Dataset.arff". Ignored when <paramref name="trainingFilePath"/> is not null.</param>
/// <param name="trainingFilePath">Explicit training ARFF path, or null to use the default
/// dataset chosen by <paramref name="type"/>.</param>
/// <param name="directoryName">Not used by this method.</param>
/// <param name="textFilterType">Selects which cached model / training-file snapshot pair is used.</param>
private void FilteredSVM(string type, string trainingFilePath, string directoryName, TextFilterType textFilterType)
{
    var appDataDir = System.Environment.GetFolderPath(System.Environment.SpecialFolder.ApplicationData);

    // All input data and cached models live under %APPDATA%\MARC 2.0.
    string specificFolder = System.IO.Path.Combine(appDataDir, "MARC 2.0");
    if (!Directory.Exists(specificFolder))
    {
        Directory.CreateDirectory(specificFolder);
    }

    try
    {
        // An explicitly supplied training file wins over the default dataset for the type.
        string trainingDatatsetFilePath = trainingFilePath;
        if (trainingDatatsetFilePath == null)
        {
            trainingDatatsetFilePath = type == "BOF"
                ? specificFolder + "\\InputData\\TrainingDatasets\\BOF Dataset.arff"
                : specificFolder + "\\InputData\\TrainingDatasets\\BOW Dataset.arff";
        }
        var testDatasetFilePath = specificFolder + "\\InputData\\TrainingDatasets\\Test.arff";

        Instances trainInsts = LoadSvmDataset(trainingDatatsetFilePath);   // text examples
        Instances classifyInsts = LoadSvmDataset(testDatasetFilePath);     // text to classify

        // Use the configured filter. Previously a fresh default StringToWordVector was
        // passed to setFilter, so the TF transform set here was silently dropped.
        // NOTE(review): models cached before this fix were trained without the TF
        // transform — consider invalidating them.
        StringToWordVector stringtowordvector = new StringToWordVector();
        stringtowordvector.setTFTransform(true);

        FilteredClassifier model = new FilteredClassifier();
        model.setFilter(stringtowordvector);

        weka.classifiers.Classifier smocls = new weka.classifiers.functions.SMO();
        smocls.setOptions(weka.core.Utils.splitOptions("-C 1.0 -L 0.0010 -P 1.0E-12 -N 0 -V -1 -W 1 -K \"weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0\""));
        model.setClassifier(smocls);

        string directoryRoot = specificFolder;
        bool exists;

        // Reuse the cached model for the selected text filter, or build and cache it.
        switch (textFilterType)
        {
            case TextFilterType.NoFilter:
                exists = SVMNoFilterCheckifModelExists(trainingDatatsetFilePath);
                model = LoadOrBuildSvmModel(model, trainInsts, trainingDatatsetFilePath, exists,
                    directoryRoot + @"\Model\SVM\SVMNoFilterModel.dat",
                    directoryRoot + @"\Model\SVM\\SVMNoFilterFile.dat");
                break;

            case TextFilterType.StopwordsRemoval:
                exists = SVMSWRCheckifModelExists(trainingDatatsetFilePath);
                model = LoadOrBuildSvmModel(model, trainInsts, trainingDatatsetFilePath, exists,
                    directoryRoot + @"\Model\SVM\SVMSWRFilterModel.dat",
                    directoryRoot + @"\Model\SVM\\SVMSWRFile.dat");
                break;

            case TextFilterType.Stemming:
                exists = SVMSTCheckifModelExists(trainingDatatsetFilePath);
                model = LoadOrBuildSvmModel(model, trainInsts, trainingDatatsetFilePath, exists,
                    directoryRoot + @"\Model\SVM\SVMSTFilterModel.dat",
                    directoryRoot + @"\Model\SVM\\SVMSTFile.dat");
                break;

            case TextFilterType.StopwordsRemovalStemming:
                exists = SVMSWRSTCheckifModelExists(trainingDatatsetFilePath);
                model = LoadOrBuildSvmModel(model, trainInsts, trainingDatatsetFilePath, exists,
                    directoryRoot + @"\Model\SVM\SVMSWRSTFilterModel.dat",
                    directoryRoot + @"\Model\SVM\\SVMSWRSTFile.dat");
                break;

            default:
                // No model is built or loaded for other filter types; the classification
                // loop below then runs against the untrained model, as before.
                break;
        }

        for (int i = 0; i < classifyInsts.numInstances(); i++)
        {
            var current = classifyInsts.instance(i);
            current.setClassMissing();
            double cls = model.classifyInstance(current);
            current.setClassValue(cls);

            // Class index 0 and 1 map to the two named labels; everything else is "Other".
            classification = cls == 0 ? "Bug Report" : cls == 1 ? "Feature Request" : "Other";
            tempAllClassification.Add(classification);
        }
        AllClassification = tempAllClassification;
    }
    catch (Exception o)
    {
        // Failures are reported through the error field rather than thrown to the caller.
        error = o.ToString();
    }
}

/// <summary>
/// Reads an ARFF file into an Instances object, marks the last attribute as the class,
/// and always closes the underlying reader.
/// </summary>
private static Instances LoadSvmDataset(string arffPath)
{
    java.io.BufferedReader reader = new BufferedReader(new FileReader(arffPath));
    try
    {
        Instances data = new Instances(reader);
        data.setClassIndex(data.numAttributes() - 1);
        return data;
    }
    finally
    {
        reader.close();
    }
}

/// <summary>
/// Returns the cached model at <paramref name="modelPath"/> when <paramref name="cached"/>
/// is true; otherwise trains <paramref name="untrained"/>, writes it to
/// <paramref name="modelPath"/>, and stores a copy of the training file's text at
/// <paramref name="snapshotPath"/>.
/// </summary>
private static FilteredClassifier LoadOrBuildSvmModel(FilteredClassifier untrained, Instances trainInsts,
    string trainingPath, bool cached, string modelPath, string snapshotPath)
{
    if (cached)
    {
        return Helper.Helper.ReadFromBinaryFile<FilteredClassifier>(modelPath);
    }

    untrained.buildClassifier(trainInsts);

    // Make sure the cache directory exists before writing; a no-op when it already does.
    Directory.CreateDirectory(System.IO.Path.GetDirectoryName(modelPath));
    Helper.Helper.WriteToBinaryFile<FilteredClassifier>(modelPath, untrained);

    string content = System.IO.File.ReadAllText(trainingPath);
    System.IO.File.WriteAllText(snapshotPath, content);
    return untrained;
}
/// <summary>
/// Entry point. Loads restaurant businesses, their reviews and the reviewers' profiles
/// from line-delimited JSON files, clusters each business's reviews with k-means and
/// records reviewers that share a cluster as "friends", then writes one XML file per
/// user plus a "userlist.txt" index under the "users" sub-directory.
/// </summary>
/// <param name="arg">Command-line arguments; not used.</param>
static void Main(string[] arg)
{
    const string dataDirectory = @"F:\Data Science Project";

    Dictionary<string, Business> businesses = LoadRestaurantBusinesses(dataDirectory + @"\businesses.txt");

    Dictionary<string, User> users = new Dictionary<string, User>();
    LoadRestaurantReviews(dataDirectory + @"\reviews.txt", businesses, users);
    LoadUserProfiles(dataDirectory + @"\users.txt", users);

    foreach (Business business in businesses.Values)
    {
        LinkReviewersByCluster(business, users);
    }

    XmlDocument xmlDoc = BuildUsersDocument(users, businesses);
    WriteUserFiles(xmlDoc, dataDirectory + @"\users");
}

/// <summary>Parses every "business" line whose categories mention "Restaurants", keyed by business_id.</summary>
private static Dictionary<string, Business> LoadRestaurantBusinesses(string path)
{
    Dictionary<string, Business> businesses = new Dictionary<string, Business>();
    using (StreamReader reader = new StreamReader(path))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (!line.Contains("\"type\": \"business\""))
            {
                continue;
            }

            JObject obj = JObject.Parse(line);
            string categories = obj["categories"].ToString();
            if (!categories.Contains("Restaurants"))
            {
                continue;
            }

            Business business = new Business();
            business.name = obj["name"].ToString();
            business.full_address = obj["full_address"].ToString();
            business.city = obj["city"].ToString();
            business.state = obj["state"].ToString();
            // Typed conversions avoid a culture-dependent ToString()/Parse round trip.
            business.latitude = (double)obj["latitude"];
            business.longitude = (double)obj["longitude"];
            business.stars = (float)obj["stars"];
            business.review_count = (int)obj["review_count"];
            business.categories = categories;
            businesses.Add(obj["business_id"].ToString(), business);
        }
    }
    return businesses;
}

/// <summary>
/// Parses every "review" line that belongs to a known restaurant, attaching the review to
/// both its business and its author (creating the author entry on first sight).
/// </summary>
private static void LoadRestaurantReviews(string path, Dictionary<string, Business> businesses, Dictionary<string, User> users)
{
    using (StreamReader reader = new StreamReader(path))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (!line.Contains("\"type\": \"review\""))
            {
                continue;
            }

            JObject obj = JObject.Parse(line);
            string businessId = obj["business_id"].ToString();

            Business business;
            if (!businesses.TryGetValue(businessId, out business) || !business.categories.Contains("Restaurants"))
            {
                continue;
            }

            string userId = obj["user_id"].ToString();

            Review review = new Review();
            review.text = RemoveSpecialCharacters(obj["text"].ToString());
            review.business_id = businessId;
            review.stars = (float)obj["stars"];
            review.user_id = userId;

            User author;
            if (!users.TryGetValue(userId, out author))
            {
                author = new User();
                users.Add(userId, author);
            }
            author.reviews.Add(review);
            business.reviews.Add(review);
        }
    }
}

/// <summary>Fills in profile fields for users that already exist in <paramref name="users"/>.</summary>
private static void LoadUserProfiles(string path, Dictionary<string, User> users)
{
    using (StreamReader reader = new StreamReader(path))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (!line.Contains("\"type\": \"user\""))
            {
                continue;
            }

            JObject obj = JObject.Parse(line);
            string userId = obj["user_id"].ToString();

            User user;
            if (!users.TryGetValue(userId, out user))
            {
                continue;
            }

            user.name = obj["name"].ToString();
            user.user_id = userId;
            user.average_stars = (float)obj["average_stars"];
            user.review_count = (int)obj["review_count"];
            user.funny_votes = (int)obj["votes"]["funny"];
            user.useful_votes = (int)obj["votes"]["useful"];
            user.cool_votes = (int)obj["votes"]["cool"];
        }
    }
}

/// <summary>
/// Clusters one business's reviews (k-means over a word-vector of the review text, about
/// one cluster per five reviews) and, for every pair of distinct reviews in the same
/// cluster, adds the second review's author to the first author's friends.
/// </summary>
private static void LinkReviewersByCluster(Business business, Dictionary<string, User> users)
{
    var reviews = business.reviews;

    // Fewer than two reviews can never produce a pair; an empty list would also ask
    // k-means for zero clusters, which Weka is expected to reject.
    if (reviews.Count < 2)
    {
        return;
    }

    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    StringBuilder arff = new StringBuilder();
    arff.AppendLine("@relation reviews");
    arff.AppendLine("@attribute user_id string");
    arff.AppendLine("@attribute text string");
    arff.AppendLine("@attribute stars numeric");
    arff.AppendLine("@data");
    foreach (Review review in reviews)
    {
        // ARFF numbers must use '.' as the decimal separator regardless of the OS locale.
        arff.Append(review.user_id + ",\"" + review.text + "\"" + "," + review.stars.ToString(invariant));
        arff.AppendLine("");
    }

    Instances instances = new Instances(new java.io.StringReader(arff.ToString()));

    // Turn the user_id attribute into a nominal attribute.
    StringToNominal nominalFilter = new StringToNominal();
    string[] options = weka.core.Utils.splitOptions("-R first");
    nominalFilter.setOptions(options);
    nominalFilter.setInputFormat(instances);
    instances = weka.filters.Filter.useFilter(instances, nominalFilter);

    // Turn the text attribute into a word vector.
    // NOTE(review): "-delimiters ..." sits outside any quoted tokenizer spec, so it may
    // not reach the tokenizer — confirm against the StringToWordVector option syntax.
    StringToWordVector stwFilter = new StringToWordVector();
    options = weka.core.Utils.splitOptions("-R first-last -P att_ -W 1000 -prune-rate -1.0 -T -I -N 0 -L -stemmer weka.core.stemmers.NullStemmer -M 2 -tokenizer weka.core.tokenizers.WordTokenizer -delimiters \\r\\n\\t.,;:\\\'\\\"()?!\"\"");
    stwFilter.setOptions(options);
    stwFilter.setInputFormat(instances);
    instances = weka.filters.Filter.useFilter(instances, stwFilter);

    SimpleKMeans kmeansClusterer = new SimpleKMeans();
    options = weka.core.Utils.splitOptions("-N " + Math.Ceiling(reviews.Count / (decimal)5).ToString(invariant) + " -A \"weka.core.EuclideanDistance -R first-last\" -I 5 -S 10");
    kmeansClusterer.setOptions(options);
    kmeansClusterer.setPreserveInstancesOrder(true);
    kmeansClusterer.buildClusterer(instances);

    int[] assignments = kmeansClusterer.getAssignments();

    // Group review indices by cluster so only same-cluster pairs are visited. Indices stay
    // in ascending order, so friends are added in the same order as a full j/k scan.
    Dictionary<int, List<int>> membersByCluster = new Dictionary<int, List<int>>();
    for (int i = 0; i < assignments.Length; i++)
    {
        List<int> members;
        if (!membersByCluster.TryGetValue(assignments[i], out members))
        {
            members = new List<int>();
            membersByCluster.Add(assignments[i], members);
        }
        members.Add(i);
    }

    for (int j = 0; j < assignments.Length; j++)
    {
        User reviewer = users[reviews[j].user_id];
        foreach (int k in membersByCluster[assignments[j]])
        {
            if (k != j)
            {
                reviewer.friends.Add(reviews[k].user_id);
            }
        }
    }
}

/// <summary>
/// Builds the &lt;users&gt; document: one &lt;user&gt; element (id, name, friends, reviews with
/// text/lat/long) for every user that has at least one friend.
/// </summary>
private static XmlDocument BuildUsersDocument(Dictionary<string, User> users, Dictionary<string, Business> businesses)
{
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    XmlDocument xmlDoc = new XmlDocument();
    XmlElement usersElt = xmlDoc.CreateElement("users");
    xmlDoc.AppendChild(usersElt);

    foreach (User user in users.Values)
    {
        if (user.friends.Count == 0)
        {
            continue;
        }
        if (string.IsNullOrEmpty(user.name))
        {
            user.name = "No Name";
        }

        XmlElement userElt = xmlDoc.CreateElement("user");

        XmlAttribute idAtt = xmlDoc.CreateAttribute("id");
        idAtt.Value = user.user_id;
        userElt.Attributes.Append(idAtt);

        XmlElement nameElt = xmlDoc.CreateElement("name");
        nameElt.InnerText = user.name;
        userElt.AppendChild(nameElt);

        XmlElement friendsElt = xmlDoc.CreateElement("friends");
        foreach (string friendId in user.friends)
        {
            XmlElement friendElt = xmlDoc.CreateElement("friend");
            friendElt.InnerText = friendId;
            friendsElt.AppendChild(friendElt);
        }
        userElt.AppendChild(friendsElt);

        XmlElement reviewsElt = xmlDoc.CreateElement("reviews");
        foreach (Review review in user.reviews)
        {
            Business reviewed = businesses[review.business_id];
            XmlElement reviewElt = xmlDoc.CreateElement("review");

            XmlElement textElt = xmlDoc.CreateElement("text");
            textElt.InnerText = review.text;
            reviewElt.AppendChild(textElt);

            // Coordinates are machine-readable data, so format them culture-independently.
            XmlElement latElt = xmlDoc.CreateElement("lat");
            latElt.InnerText = reviewed.latitude.ToString(invariant);
            reviewElt.AppendChild(latElt);

            XmlElement longElt = xmlDoc.CreateElement("long");
            longElt.InnerText = reviewed.longitude.ToString(invariant);
            reviewElt.AppendChild(longElt);

            reviewsElt.AppendChild(reviewElt);
        }
        userElt.AppendChild(reviewsElt);

        usersElt.AppendChild(userElt);
    }
    return xmlDoc;
}

/// <summary>
/// Writes each &lt;user&gt; element that has a non-empty id to "&lt;id&gt;.txt" in
/// <paramref name="outputDirectory"/> and an id/name index of those users to "userlist.txt".
/// </summary>
private static void WriteUserFiles(XmlDocument xmlDoc, string outputDirectory)
{
    using (StreamWriter userListWriter = new StreamWriter(outputDirectory + @"\userlist.txt"))
    {
        userListWriter.Write("<users>");
        foreach (XmlNode user in xmlDoc.DocumentElement.SelectNodes("//user"))
        {
            string id = user.Attributes["id"].InnerText;
            if (string.IsNullOrEmpty(id))
            {
                continue;
            }

            using (StreamWriter writer = new StreamWriter(outputDirectory + @"\" + id + ".txt"))
            {
                writer.Write(user.OuterXml);
            }

            // InnerText is unescaped, so escape it before splicing it into hand-built XML;
            // otherwise a name containing '&' or '<' makes the index malformed.
            string name = user.SelectSingleNode("name").InnerText;
            userListWriter.Write("<user id=\"" + System.Security.SecurityElement.Escape(id) + "\"><name>"
                + System.Security.SecurityElement.Escape(name) + "</name></user>");
        }
        userListWriter.Write("</users>");
    }
}