Ejemplo n.º 1
0
 /// <summary>
 /// Finalizes a training session: vectorizes the collected dataset with a
 /// StringToWordVector filter and trains a RAkEL multi-label classifier over
 /// Binary Relevance + Naive Bayes.
 /// </summary>
 public void EndTrainingSession()
 {
     Console.WriteLine("End");

     // Configure the text-to-vector filter completely BEFORE setInputFormat:
     // Weka initializes the filter's internal state in setInputFormat, so any
     // option set afterwards (previously the stemmer and IDF transform) is
     // ignored for the batch being filtered.
     stv = new StringToWordVector();
     stv.setAttributeNamePrefix("#");
     stv.setLowerCaseTokens(true);
     stv.setOutputWordCounts(true);
     stv.setStemmer(new weka.core.stemmers.LovinsStemmer());
     stv.setIDFTransform(true);
     stv.setInputFormat(oDataSet);

     dataSet = Filter.useFilter(oDataSet, stv);

     // Wrap the filtered data as a multi-label dataset (tagsNb label attributes)
     // and train: Binary Relevance base learner inside a RAkEL ensemble.
     MultiLabelInstances mli = new MultiLabelInstances(dataSet, loadLabelsMeta(dataSet, tagsNb));
     BinaryRelevance br = new mulan.classifier.transformation.BinaryRelevance(new NaiveBayes());
     lps = new mulan.classifier.meta.RAkEL(br);
     br.setDebug(true);
     lps.setDebug(true);
     lps.build(mli);
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Trains the Weka classifier from the server-side ARFF data and persists
        /// both the trained model and the training data under the Classificador folder.
        /// </summary>
        private void RealizaTreinamentoWeka()
        {
            // Timestamp shown to the user for the last training run.
            TempData["DataTreinamento"] = DateTime.Now;

            Instances dadosTreinamento = ImportaArquivosServidor();

            // Chain StringToWordVector with attribute selection into a single MultiFilter.
            weka.filters.Filter[] filters = new weka.filters.Filter[2];
            filters[0] = new StringToWordVector();
            filters[1] = AttributeSelectionFilter(2);

            weka.filters.MultiFilter filter = new weka.filters.MultiFilter();
            // Set the sub-filters BEFORE setInputFormat: Weka derives the output
            // format from the configured filter chain, so calling setInputFormat
            // first (as before) initializes the MultiFilter with an empty chain.
            filter.setFilters(filters);
            filter.setInputFormat(dadosTreinamento);

            // Build the classifier (filter + Naive Bayes) from the training data.
            FilteredClassifier classifier = new FilteredClassifier();

            classifier.setFilter(filter);
            classifier.setClassifier(new NaiveBayes());
            classifier.buildClassifier(dadosTreinamento);

            // NOTE(review): a cross-validation block (Evaluation.crossValidateModel,
            // 10 folds) used to live here but was commented out and syntactically
            // broken; removed. Re-add via weka.classifiers.Evaluation if statistics
            // are needed again.

            // Save the classifier (model) as /Classificador/Classificador.model
            SerializationHelper.write(string.Format("{0}Classificador.model", DiretorioClassificadorServidor), classifier);

            // Save the training data as /Classificador/DadosTreinamento.arff
            Utilidades.SalvarArff(dadosTreinamento, DiretorioClassificadorServidor, "DadosTreinamento.arff");
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Filtered Support Vector Machine Classification with type specified. i.e. BOF or BOW.
        /// Builds (or loads a cached) SMO model for the requested text-filter variant,
        /// classifies the test instances and stores the per-instance labels.
        /// </summary>
        /// <param name="type">"BOF" selects the BOF dataset; anything else selects the BOW dataset.</param>
        /// <param name="trainingFilePath">Optional explicit training ARFF path; overrides the default when non-null.</param>
        /// <param name="directoryName">Not used by this method; kept for interface compatibility.</param>
        /// <param name="textFilterType">Which preprocessing variant's cached model to build/load.</param>
        private void FilteredSVM(string type, string trainingFilePath, string directoryName, TextFilterType textFilterType)
        {
            var currDir = System.Environment.GetFolderPath(System.Environment.SpecialFolder.ApplicationData);

            // All cached models and datasets live under %APPDATA%\MARC 2.0.
            string specificFolder = System.IO.Path.Combine(currDir, "MARC 2.0");

            // Check if folder exists and if not, create it
            if (!Directory.Exists(specificFolder))
            {
                Directory.CreateDirectory(specificFolder);
            }

            try
            {
                var trainingDatatsetFilePath =
                    type == "BOF"
                        ? specificFolder + "\\InputData\\TrainingDatasets\\BOF Dataset.arff"
                        : specificFolder + "\\InputData\\TrainingDatasets\\BOW Dataset.arff";

                var testDatasetFilePath = specificFolder + "\\InputData\\TrainingDatasets\\Test.arff";

                // If training file path is supplied then it takes precedence.
                if (trainingFilePath != null)
                {
                    trainingDatatsetFilePath = trainingFilePath;
                }

                java.io.BufferedReader trainReader    = new BufferedReader(new FileReader(trainingDatatsetFilePath)); // File with text examples
                BufferedReader         classifyReader = new BufferedReader(new FileReader(testDatasetFilePath));      // File with text to classify

                Instances trainInsts    = new Instances(trainReader);
                Instances classifyInsts = new Instances(classifyReader);

                // The readers are fully consumed by the Instances constructors;
                // close them so the file handles are not leaked.
                trainReader.close();
                classifyReader.close();

                // The class attribute is the last attribute in both ARFF files.
                trainInsts.setClassIndex(trainInsts.numAttributes() - 1);
                classifyInsts.setClassIndex(classifyInsts.numAttributes() - 1);

                FilteredClassifier model = new FilteredClassifier();

                // BUG FIX: the configured vector filter (TF transform enabled) was
                // previously discarded in favour of a fresh, unconfigured
                // StringToWordVector passed to setFilter.
                StringToWordVector stringtowordvector = new StringToWordVector();
                stringtowordvector.setTFTransform(true);
                model.setFilter(stringtowordvector);

                // SMO with a linear PolyKernel (an alternative Puk-kernel configuration
                // was tried and abandoned).
                weka.classifiers.Classifier smocls = new weka.classifiers.functions.SMO();
                smocls.setOptions(weka.core.Utils.splitOptions("-C 1.0 -L 0.0010 -P 1.0E-12 -N 0 -V -1 -W 1 -K \"weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0\""));
                model.setClassifier(smocls);

                // Models are cached under %APPDATA%\MARC 2.0\Model\SVM.
                var directoryRoot = specificFolder;

                // Builds a model (and caches it plus a copy of the training file) when
                // no cached model exists for the current filter; otherwise loads the cache.
                FilteredClassifier BuildOrLoad(bool modelExists, string modelFileName, string dataFileName)
                {
                    string modelPath = directoryRoot + @"\Model\SVM\" + modelFileName;
                    if (!modelExists)
                    {
                        model.buildClassifier(trainInsts);
                        Helper.Helper.WriteToBinaryFile <FilteredClassifier>(modelPath, model);
                        string content = System.IO.File.ReadAllText(trainingDatatsetFilePath);
                        using (var sW = new StreamWriter(directoryRoot + @"\Model\SVM\" + dataFileName))
                        {
                            sW.Write(content);
                        }
                        return model;
                    }
                    return Helper.Helper.ReadFromBinaryFile <FilteredClassifier>(modelPath);
                }

                // Check if the model exists for the chosen filter and build it if not.
                switch (textFilterType)
                {
                case TextFilterType.NoFilter:
                    model = BuildOrLoad(SVMNoFilterCheckifModelExists(trainingDatatsetFilePath), "SVMNoFilterModel.dat", "SVMNoFilterFile.dat");
                    break;

                case TextFilterType.StopwordsRemoval:
                    model = BuildOrLoad(SVMSWRCheckifModelExists(trainingDatatsetFilePath), "SVMSWRFilterModel.dat", "SVMSWRFile.dat");
                    break;

                case TextFilterType.Stemming:
                    model = BuildOrLoad(SVMSTCheckifModelExists(trainingDatatsetFilePath), "SVMSTFilterModel.dat", "SVMSTFile.dat");
                    break;

                case TextFilterType.StopwordsRemovalStemming:
                    model = BuildOrLoad(SVMSWRSTCheckifModelExists(trainingDatatsetFilePath), "SVMSWRSTFilterModel.dat", "SVMSWRSTFile.dat");
                    break;

                default:
                    break;
                }

                // Classify every test instance and map the numeric class to its label.
                for (int i = 0; i < classifyInsts.numInstances(); i++)
                {
                    classifyInsts.instance(i).setClassMissing();
                    double cls = model.classifyInstance(classifyInsts.instance(i));
                    classifyInsts.instance(i).setClassValue(cls);
                    classification = cls == 0 ? "Bug Report"
                                    : cls == 1 ? "Feature Request"
                                    : "Other";
                    tempAllClassification.Add(classification);
                }
                AllClassification = tempAllClassification;
            }
            catch (Exception o)
            {
                // Surface the failure to the caller via the error field rather than crashing.
                error = o.ToString();
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Pipeline: parse Yelp-style JSON dumps (businesses, reviews, users), cluster
        /// each restaurant's reviews with k-means to infer "friend" links between
        /// reviewers that land in the same cluster, then export one XML file per user
        /// plus a user index file.
        /// </summary>
        static void Main(string[] arg)
        {
            // The JSON/ARFF/XML data uses '.' as decimal separator; parse and format
            // culture-invariantly so the program behaves identically on any OS locale.
            var inv = System.Globalization.CultureInfo.InvariantCulture;

            // --- Parse the businesses, keeping restaurants only ---
            Dictionary<string, Business> businesses = new Dictionary<string, Business>();
            HashSet<string> set = new HashSet<string>(); // distinct category names seen
            using (StreamReader businessReader = new StreamReader(@"F:\Data Science Project\businesses.txt"))
            {
                string line = businessReader.ReadLine();
                while (line != null)
                {
                    if (line.Contains("\"type\": \"business\""))
                    {
                        JObject obj = JObject.Parse(line);
                        if (obj["categories"].ToString().Contains("Restaurants"))
                        {
                            JArray array = JArray.Parse(obj["categories"].ToString());
                            foreach (JValue val in array)
                            {
                                set.Add(val.ToString());
                            }
                            Business business = new Business();
                            business.name = obj["name"].ToString();
                            business.full_address = obj["full_address"].ToString();
                            business.city = obj["city"].ToString();
                            business.state = obj["state"].ToString();
                            business.latitude = double.Parse(obj["latitude"].ToString(), inv);
                            business.longitude = double.Parse(obj["longitude"].ToString(), inv);
                            business.stars = float.Parse(obj["stars"].ToString(), inv);
                            business.review_count = int.Parse(obj["review_count"].ToString(), inv);
                            business.categories = obj["categories"].ToString();
                            // Indexer instead of Add: a duplicated business_id in the
                            // dump should not abort the whole run.
                            businesses[obj["business_id"].ToString()] = business;
                        }
                    }
                    line = businessReader.ReadLine();
                }
            }

            // --- Parse the reviews, attaching each to its business and its author ---
            Dictionary<string, User> users = new Dictionary<string, User>();
            using (StreamReader reviewReader = new StreamReader(@"F:\Data Science Project\reviews.txt"))
            {
                string line = reviewReader.ReadLine();
                while (line != null)
                {
                    if (line.Contains("\"type\": \"review\""))
                    {
                        JObject obj = JObject.Parse(line);
                        string businessId = obj["business_id"].ToString();
                        Business reviewedBusiness;
                        if (businesses.TryGetValue(businessId, out reviewedBusiness) && reviewedBusiness.categories.Contains("Restaurants"))
                        {
                            Review review = new Review();
                            review.text = RemoveSpecialCharacters(obj["text"].ToString());
                            review.business_id = businessId;
                            review.stars = float.Parse(obj["stars"].ToString(), inv);
                            review.user_id = obj["user_id"].ToString();

                            // Create the author on first sight, then record the review.
                            User author;
                            if (!users.TryGetValue(review.user_id, out author))
                            {
                                author = new User();
                                users.Add(review.user_id, author);
                            }
                            author.reviews.Add(review);
                            reviewedBusiness.reviews.Add(review);
                        }
                    }
                    line = reviewReader.ReadLine();
                }
            }

            // --- Parse the users, filling in profile data for known reviewers ---
            using (StreamReader userReader = new StreamReader(@"F:\Data Science Project\users.txt"))
            {
                string line = userReader.ReadLine();
                while (line != null)
                {
                    if (line.Contains("\"type\": \"user\""))
                    {
                        JObject obj = JObject.Parse(line);
                        string userId = obj["user_id"].ToString();
                        User user;
                        if (users.TryGetValue(userId, out user))
                        {
                            user.name = obj["name"].ToString();
                            user.user_id = userId;
                            user.average_stars = float.Parse(obj["average_stars"].ToString(), inv);
                            user.review_count = int.Parse(obj["review_count"].ToString(), inv);
                            user.funny_votes = int.Parse(obj["votes"]["funny"].ToString(), inv);
                            user.useful_votes = int.Parse(obj["votes"]["useful"].ToString(), inv);
                            user.cool_votes = int.Parse(obj["votes"]["cool"].ToString(), inv);
                        }
                    }
                    line = userReader.ReadLine();
                }
            }

            // --- Cluster each business's reviews; co-clustered reviewers become friends ---
            foreach (KeyValuePair<string, Business> business in businesses)
            {
                // Build an in-memory ARFF document: user id, review text, star rating.
                StringBuilder sb = new StringBuilder();
                sb.AppendLine("@relation reviews");
                sb.AppendLine("@attribute user_id string");
                sb.AppendLine("@attribute text string");
                sb.AppendLine("@attribute stars numeric");
                sb.AppendLine("@data");
                foreach (Review review in business.Value.reviews)
                {
                    // Invariant formatting keeps the numeric ARFF column valid on any locale.
                    sb.Append(review.user_id + ",\"" + review.text + "\"" + "," + review.stars.ToString(inv));
                    sb.AppendLine("");
                }
                Instances instances = new Instances(new java.io.StringReader(sb.ToString()));

                // user_id must be nominal for clustering; convert the first attribute.
                StringToNominal nominalFilter = new StringToNominal();
                String[] options = weka.core.Utils.splitOptions("-R first");
                nominalFilter.setOptions(options);
                nominalFilter.setInputFormat(instances);
                instances = weka.filters.Filter.useFilter(instances, nominalFilter);

                // Vectorize the review text into word-count features.
                StringToWordVector stwFilter = new StringToWordVector();
                options = weka.core.Utils.splitOptions("-R first-last -P att_ -W 1000 -prune-rate -1.0 -T -I -N 0 -L -stemmer weka.core.stemmers.NullStemmer -M 2 -tokenizer weka.core.tokenizers.WordTokenizer -delimiters \\r\\n\\t.,;:\\\'\\\"()?!\"\"");
                stwFilter.setOptions(options);
                stwFilter.setInputFormat(instances);
                instances = weka.filters.Filter.useFilter(instances, stwFilter);

                // k-means with roughly 5 reviews per cluster.
                SimpleKMeans kmeansClusterer = new SimpleKMeans();
                options = weka.core.Utils.splitOptions("-N " + Math.Ceiling(business.Value.reviews.Count / (decimal)5).ToString() + " -A \"weka.core.EuclideanDistance -R first-last\" -I 5 -S 10");
                kmeansClusterer.setOptions(options);
                kmeansClusterer.setPreserveInstancesOrder(true);
                kmeansClusterer.buildClusterer(instances);
                int[] assignments = kmeansClusterer.getAssignments();

                // Reviewers assigned to the same cluster of the same restaurant are linked.
                for (int j = 0; j < assignments.Length; j++)
                {
                    for (int k = 0; k < assignments.Length; k++)
                    {
                        if (j != k && assignments[j] == assignments[k])
                        {
                            users[business.Value.reviews[j].user_id].friends.Add(business.Value.reviews[k].user_id);
                        }
                    }
                }
            }

            // --- Build the XML export: one <user> element per user that has friends ---
            XmlDocument xmlDoc = new XmlDocument();
            XmlElement usersElt = xmlDoc.CreateElement("users");
            xmlDoc.AppendChild(usersElt);
            foreach (KeyValuePair<string, User> user in users)
            {
                if (user.Value.friends.Count == 0) continue;
                if (string.IsNullOrEmpty(user.Value.name))
                    user.Value.name = "No Name";
                XmlElement userElt = xmlDoc.CreateElement("user");

                // id attribute
                XmlAttribute idAtt = xmlDoc.CreateAttribute("id");
                idAtt.Value = user.Value.user_id;
                userElt.Attributes.Append(idAtt);

                // name
                XmlElement nameElt = xmlDoc.CreateElement("name");
                nameElt.InnerText = user.Value.name;
                userElt.AppendChild(nameElt);

                // friends
                XmlElement friendsElt = xmlDoc.CreateElement("friends");
                foreach (string user_id in user.Value.friends)
                {
                    XmlElement friendElt = xmlDoc.CreateElement("friend");
                    friendElt.InnerText = user_id;
                    friendsElt.AppendChild(friendElt);
                }
                userElt.AppendChild(friendsElt);

                // reviews (geo coordinates come from the reviewed business; invariant
                // formatting keeps the XML machine-readable on any locale)
                XmlElement reviewsElt = xmlDoc.CreateElement("reviews");
                foreach (Review review in user.Value.reviews)
                {
                    XmlElement reviewElt = xmlDoc.CreateElement("review");

                    XmlElement textElt = xmlDoc.CreateElement("text");
                    textElt.InnerText = review.text;
                    reviewElt.AppendChild(textElt);

                    XmlElement latElt = xmlDoc.CreateElement("lat");
                    latElt.InnerText = businesses[review.business_id].latitude.ToString(inv);
                    reviewElt.AppendChild(latElt);

                    XmlElement longElt = xmlDoc.CreateElement("long");
                    longElt.InnerText = businesses[review.business_id].longitude.ToString(inv);
                    reviewElt.AppendChild(longElt);

                    reviewsElt.AppendChild(reviewElt);
                }
                userElt.AppendChild(reviewsElt);
                usersElt.AppendChild(userElt);
            }

            // --- Write one XML file per user plus an index; writers are disposed even on error ---
            using (StreamWriter userlistwriter = new StreamWriter(@"F:\Data Science Project\users\userlist.txt"))
            {
                userlistwriter.Write("<users>");
                XmlNodeList usersList = xmlDoc.DocumentElement.SelectNodes("//user");
                foreach (XmlNode user in usersList)
                {
                    if (string.IsNullOrEmpty(user.Attributes["id"].InnerText)) continue;
                    using (StreamWriter writer = new StreamWriter(@"F:\Data Science Project\users\" + user.Attributes["id"].InnerText + ".txt"))
                    {
                        writer.Write(user.OuterXml);
                    }

                    userlistwriter.Write("<user id=\"" + user.Attributes["id"].InnerText + "\"><name>" + user.SelectSingleNode("name").InnerText + "</name></user>");
                }
                userlistwriter.Write("</users>");
            }
        }