/// <summary>
/// Entry point of the feature-extraction and evaluation pipeline. For every XML file in
/// <c>Config.DataFolder</c> it computes the Detextive text features and the chunk-based
/// features M04-M13, hands the per-document result to <c>Output.SaveHtml</c>, and collects
/// a labeled dataset. The dataset is then exported via <c>Output.SaveArff</c> and
/// <c>Output.SaveTab</c> for every class type, each feature is evaluated on its own and
/// all features together with 10-fold cross-validation, one model per class type is trained
/// on the complete normalized dataset and saved, and the performance summary is written to
/// <c>ClassifierEval.txt</c> in <c>Config.OutputFolder</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    const int NumFolds = 10;
    Random rnd = new Random(1); // fixed seed, shared by all dataset shuffles below
    // The Detextive feature names are defined once: they are both the keys into
    // text.mFeatures and the first part of the exported feature-name list.
    string[] detextiveFeatureNames = "ttr,brunet,honore,hl,ttrLemma,brunetLemma,honoreLemma,hlLemma,ari,flesch,fog,rWords,rChars,rSyllables,rComplex".Split(',');
    string[] featureNames = detextiveFeatureNames.Concat("M04,M05,M06,M07,M08,M09,M10,M11,M12,M13".Split(',')).ToArray();
    LabeledDataset<BlogMetaData, SparseVector<double>> dataset = new LabeledDataset<BlogMetaData, SparseVector<double>>();
    Console.WriteLine("Analiziram besedila...");
    foreach (string fileName in Directory.GetFiles(Config.DataFolder, "*.xml"))
    {
        // load XML (the TEI default namespace declaration is stripped so that the plain XPath queries below match)
        Console.WriteLine("Datoteka {0}...", fileName);
        XmlDocument doc = new XmlDocument();
        doc.LoadXml(File.ReadAllText(fileName).Replace("xmlns=\"http://www.tei-c.org/ns/1.0\"", ""));
        Corpus corpus = new Corpus();
        corpus.LoadFromXmlFile(fileName, /*tagLen=*/int.MaxValue);
#if TEST_CHUNKER
        Text text = null;
#else
        Text text = new Text(corpus, doc.SelectSingleNode("//header/naslov").InnerText, doc.SelectSingleNode("//header/blog").InnerText/*blog identifier is used as author identifier*/);
        text.ComputeFeatures(); // compute Detextive features
#endif
        // run chunker
        Console.WriteLine("Racunam znacilke...");
        ArrayList<Chunk> chunks = Chunker.GetChunks(doc);
        chunks = new ArrayList<Chunk>(chunks.Where(x => !x.mInner)); // get non-inner chunks only
        chunks.ForEach(x => x.mType = MapChunkType(x.mType)); // move chunks from Other_* to main categories
#if TEST_CHUNKER
        return;
#endif
        // get blog meta-data
        BlogMetaData metaData = new BlogMetaData();
        metaData.mAuthorAge = doc.SelectSingleNode("//header/avtorStarost").InnerText;
        metaData.mAuthorEducation = doc.SelectSingleNode("//header/avtorIzobrazba").InnerText;
        metaData.mAuthorGender = doc.SelectSingleNode("//header/avtorSpol").InnerText;
        metaData.mAuthorLocation = doc.SelectSingleNode("//header/avtorRegija").InnerText;
        metaData.mBlog = doc.SelectSingleNode("//header/blog").InnerText;
        // compute features M04-M13
        double[] chunkFeatures = ComputeChunkFeatures(chunks);
        // create dataset
        SparseVector<double> vec = new SparseVector<double>();
        int i = 0;
        foreach (string featureName in detextiveFeatureNames)
        {
            double val = text.mFeatures[featureName];
            // undefined (NaN) and infinite feature values are stored as 0
            vec[i++] = (double.IsNaN(val) || double.IsInfinity(val)) ? 0 : val;
        }
        foreach (double val in chunkFeatures)
        {
            vec[i++] = val;
        }
        dataset.Add(new LabeledExample<BlogMetaData, SparseVector<double>>(metaData, vec));
        string htmlFileName = Config.HtmlFolder + "\\" + Path.GetFileNameWithoutExtension(fileName) + ".html";
        Output.SaveHtml(featureNames, vec, doc, chunks, htmlFileName);
    }
    // save as Orange and Weka file
    Console.WriteLine("Zapisujem datoteke Weka ARFF in Orange TAB...");
    foreach (ClassType classType in new ClassType[] { ClassType.AuthorName, ClassType.AuthorAge, ClassType.AuthorGender, ClassType.AuthorEducation, ClassType.AuthorLocation })
    {
        Output.SaveArff(featureNames, dataset, classType, Config.OutputFolder + "\\" + string.Format("OPA-{0}.arff", classType));
        Output.SaveTab(featureNames, dataset, classType, Config.OutputFolder + "\\" + string.Format("OPA-{0}.tab", classType));
    }
    // evaluate features via classification
    Console.WriteLine("Evalviram znacilke s klasifikacijskimi modeli...");
    PerfData<string> perfData = new PerfData<string>();
    ArrayList<Pair<string, IModel<string>>> models = new ArrayList<Pair<string, IModel<string>>>();
    // create classifiers (kNN was tried as well but is too slow)
    NearestCentroidClassifier<string> ncc = new NearestCentroidClassifier<string>();
    ncc.Similarity = new SingleFeatureSimilarity();
    models.Add(new Pair<string, IModel<string>>("NCC", ncc));
    SvmMulticlassClassifier<string> svm = new SvmMulticlassClassifier<string>();
    models.Add(new Pair<string, IModel<string>>("SVM", svm));
    MajorityClassifier<string, SparseVector<double>> maj = new MajorityClassifier<string, SparseVector<double>>();
    models.Add(new Pair<string, IModel<string>>("Majority", maj));
    // fallback used when a training fold has a single class or a model returns no prediction
    MajorityClassifier<string, SparseVector<double>> backupCfy = new MajorityClassifier<string, SparseVector<double>>();
    // class types in the order in which they are evaluated (the order affects how rnd is consumed)
    ClassType[] evalClassTypes = new ClassType[] { ClassType.AuthorName, ClassType.AuthorAge, ClassType.AuthorEducation, ClassType.AuthorGender, ClassType.AuthorLocation };
    foreach (Pair<string, IModel<string>> modelInfo in models) // iterate over different classifiers
    {
        Console.WriteLine("Kasifikacijski model: {0}...", modelInfo.First);
        foreach (ClassType classType in evalClassTypes) // iterate over different class types
        {
            Console.WriteLine("Ciljni razred: {0}...", classType);
            for (int fIdx = 0; fIdx < featureNames.Length; fIdx++) // iterate over different features
            {
                Console.WriteLine("Znacilka: {0}...", featureNames[fIdx]);
                LabeledDataset<string, SparseVector<double>> datasetWithSingleFeature = CreateSingleFeatureDataset(dataset, classType, fIdx);
                datasetWithSingleFeature.Shuffle(rnd);
                for (int foldNum = 1; foldNum <= NumFolds; foldNum++)
                {
                    Console.WriteLine("Sklop " + foldNum + " / " + NumFolds + "...");
                    LabeledDataset<string, SparseVector<double>> trainSet, testSet;
                    datasetWithSingleFeature.SplitForCrossValidation(NumFolds, foldNum, out trainSet, out testSet);
                    string cacheFileName = Config.OutputFolder + "\\svm-" + classType + "-" + featureNames[fIdx] + "-" + foldNum + ".bin";
                    IModel<string> foldModel = PrepareFoldModel(modelInfo.Second, backupCfy, trainSet, cacheFileName);
                    AddFoldPredictions(foldModel, backupCfy, testSet, perfData, classType.ToString(), modelInfo.First + "\t" + featureNames[fIdx], foldNum);
                }
            }
        }
    }
    // train full models (an NCC variant with Manhattan similarity was tried here as well)
    Console.WriteLine("Treniram klasifikacijske modele...");
    models.Clear();
    SvmMulticlassClassifier<string> svmFull = new SvmMulticlassClassifier<string>();
    models.Add(new Pair<string, IModel<string>>("SVM", svmFull));
    foreach (Pair<string, IModel<string>> modelInfo in models) // iterate over different classifiers
    {
        Console.WriteLine("Kasifikacijski model: {0}...", modelInfo.First);
        foreach (ClassType classType in evalClassTypes) // iterate over different class types
        {
            Console.WriteLine("Ciljni razred: {0}...", classType);
            LabeledDataset<string, SparseVector<double>> nrmDataset = CreateNormalizedDataset(dataset, classType);
            nrmDataset.Shuffle(rnd);
            for (int foldNum = 1; foldNum <= NumFolds; foldNum++)
            {
                Console.WriteLine("Sklop " + foldNum + " / " + NumFolds + "...");
                LabeledDataset<string, SparseVector<double>> trainSet, testSet;
                nrmDataset.SplitForCrossValidation(NumFolds, foldNum, out trainSet, out testSet);
                string cacheFileName = Config.OutputFolder + "\\svm-" + classType + "-full-" + foldNum + ".bin";
                // The model is resolved afresh for every fold. Previously a single variable was shared
                // across folds and class types, so one single-class fold replaced the SVM with the
                // majority classifier for the rest of the run.
                IModel<string> foldModel = PrepareFoldModel(modelInfo.Second, backupCfy, trainSet, cacheFileName);
                AddFoldPredictions(foldModel, backupCfy, testSet, perfData, classType.ToString(), modelInfo.First + "\tfull", foldNum);
            }
            // save model
            string modelFileName = Config.OutputFolder + "\\" + modelInfo.First + "-" + classType + ".model";
            if (!File.Exists(modelFileName))
            {
                // Always persist the configured classifier, never the majority fallback, and train
                // before creating the file so a failed training run does not leave an empty model
                // file behind that would be skipped on the next run.
                IModel<string> fullModel = modelInfo.Second;
                fullModel.Train(nrmDataset);
                using (BinarySerializer bs = new BinarySerializer(modelFileName, FileMode.Create))
                {
                    fullModel.Save(bs);
                }
            }
        }
    }
    using (StreamWriter w = new StreamWriter(Config.OutputFolder + "\\ClassifierEval.txt"))
    {
        w.WriteLine("*** Macro F1 ***");
        w.WriteLine();
        w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MacroF1));
        w.WriteLine();
        w.WriteLine("*** Micro F1 ***");
        w.WriteLine();
        w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MicroF1));
        w.WriteLine();
        w.WriteLine("*** Macro accuracy ***");
        w.WriteLine();
        w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MacroAccuracy));
        w.WriteLine();
        w.WriteLine("*** Micro accuracy ***");
        w.WriteLine();
        w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MicroAccuracy));
    }
    // all done
    Console.WriteLine("Koncano.");
}

/// <summary>
/// Computes features M04-M13 from Stamatatos et al.: Automatic Text Categorization in Terms
/// of Genre and Author (2000).
/// </summary>
/// <param name="chunks">Chunks of one document, already mapped to the main chunk categories.</param>
/// <returns>
/// Ten values: indices 0-4 hold the share of NP, VP, AP, PP and CON chunks among all chunks,
/// indices 5-9 the average number of items per chunk of the same five types. A value is 0
/// when its denominator is 0.
/// </returns>
private static double[] ComputeChunkFeatures(ArrayList<Chunk> chunks)
{
    ChunkType[] chunkTypes = new ChunkType[] { ChunkType.NP, ChunkType.VP, ChunkType.AP, ChunkType.PP, ChunkType.CON };
    double[] features = new double[2 * chunkTypes.Length];
    double totalChunks = chunks.Count;
    for (int t = 0; t < chunkTypes.Length; t++)
    {
        ChunkType chunkType = chunkTypes[t];
        double numChunks = chunks.Count(x => x.mType == chunkType);
        double numWords = chunks.Where(x => x.mType == chunkType).Select(x => x.mItems.Count).Sum();
        if (totalChunks > 0)
        {
            features[t] = numChunks / totalChunks;
        }
        features[chunkTypes.Length + t] = numChunks == 0 ? 0 : (numWords / numChunks);
    }
    return features;
}

/// <summary>
/// Prepares the classifier for one cross-validation fold. The majority classifier is always
/// trained on the fold; it is returned instead of <paramref name="model"/> when the training
/// set contains a single class. Otherwise <paramref name="model"/> is loaded from
/// <paramref name="cacheFileName"/> (SVM only, if the file exists) or trained on the fold.
/// </summary>
/// <param name="model">Classifier under evaluation.</param>
/// <param name="backupCfy">Majority classifier used as the fallback; retrained on every call.</param>
/// <param name="trainSet">Training part of the fold.</param>
/// <param name="cacheFileName">Path of the cached SVM model for this fold.</param>
/// <returns>The classifier to use for predicting the fold's test set.</returns>
private static IModel<string> PrepareFoldModel(IModel<string> model, MajorityClassifier<string, SparseVector<double>> backupCfy, LabeledDataset<string, SparseVector<double>> trainSet, string cacheFileName)
{
    backupCfy.Train(trainSet);
    // if there is only one class in trainSet, switch to MajorityClassifier
    if (((IEnumerable<LabeledExample<string, SparseVector<double>>>)trainSet).Select(x => x.Label).Distinct().Count() == 1)
    {
        return backupCfy;
    }
    if (model is SvmMulticlassClassifier<string> && File.Exists(cacheFileName))
    {
        using (BinarySerializer bs = new BinarySerializer(cacheFileName, FileMode.Open))
        {
            ((SvmMulticlassClassifier<string>)model).Load(bs);
        }
    }
    else
    {
        model.Train(trainSet);
    }
#if CACHE_MODELS
    // NOTE(review): the cache is read for SvmMulticlassClassifier but written only for
    // SvmMulticlassFast - confirm that this type check is intended.
    if (model is SvmMulticlassFast<string>)
    {
        using (BinarySerializer bs = new BinarySerializer(cacheFileName, FileMode.Create))
        {
            model.Save(bs);
        }
    }
#endif
    return model;
}

/// <summary>
/// Classifies every example of a fold's test set and adds the outcome to the performance
/// matrix identified by <paramref name="expName"/>, <paramref name="algoName"/> and
/// <paramref name="foldNum"/>.
/// </summary>
/// <param name="model">Classifier prepared for this fold.</param>
/// <param name="backupCfy">Majority classifier consulted when <paramref name="model"/> returns an empty prediction.</param>
/// <param name="testSet">Test part of the fold.</param>
/// <param name="perfData">Collector of the performance matrices.</param>
/// <param name="expName">First key of the performance matrix (the class type).</param>
/// <param name="algoName">Second key of the performance matrix (classifier name and feature).</param>
/// <param name="foldNum">1-based fold number.</param>
private static void AddFoldPredictions(IModel<string> model, MajorityClassifier<string, SparseVector<double>> backupCfy, LabeledDataset<string, SparseVector<double>> testSet, PerfData<string> perfData, string expName, string algoName, int foldNum)
{
    foreach (LabeledExample<string, SparseVector<double>> lblEx in testSet)
    {
        Prediction<string> pred = model.Predict(lblEx.Example);
        if (pred.Count == 0)
        {
            // if the model is unable to make a prediction, use MajorityClassifier instead
            pred = backupCfy.Predict(lblEx.Example);
        }
        perfData.GetPerfMatrix(expName, algoName, foldNum).AddCount(lblEx.Label, pred.BestClassLabel);
    }
}
/// <summary>
/// Cross-validates the binary SVM obtained from <c>BinarySvm.RunInstanceNull</c> on the labeled
/// data that comes with it, and writes the label distribution, the accuracy of every fold,
/// the summed performance matrix and the per-label precision through <c>Output</c>.
/// </summary>
/// <param name="args">
/// If non-empty, the first element is the fold count (an <c>int</c>; 10 is used otherwise).
/// All elements after the first are forwarded to <c>BinarySvm.RunInstanceNull</c>.
/// </param>
public override void Run(object[] args)
{
    // keys under which the fold matrices are stored in the performance data
    const string ExperimentName = "tutorial";
    const string AlgorithmName = "binary svm";

    int numFolds = 10;
    if (args.Any())
    {
        numFolds = (int)args[0];
    }
    object[] forwardedArgs = args.Skip(1).ToArray();

    // obtain the classifier and the labeled examples
    BinarySvm svmExample = BinarySvm.RunInstanceNull(forwardedArgs);
    SvmBinaryClassifier<string> svm = (SvmBinaryClassifier<string>)svmExample.Result["classifier"];
    LabeledDataset<string, SparseVector<double>> examples = (LabeledDataset<string, SparseVector<double>>)svmExample.Result["labeled_data"];

    // prepare the example order for the chosen kind of cross-validation
    bool useStratification = true;
    if (!useStratification)
    {
        examples.Shuffle(new Random(1));
    }
    else
    {
        examples.GroupLabels(true);
    }

    PerfData<string> results = new PerfData<string>();

    // report how many examples carry each label
    foreach (var labelGroup in examples.GroupBy(le => le.Label))
    {
        int groupSize = labelGroup.Count();
        double share = (double)groupSize / examples.Count;
        Output.WriteLine("total {0} {1}\t {2:0.00}", labelGroup.Key, groupSize, share);
    }

    string modeLabel;
    if (useStratification)
    {
        modeLabel = "stratified ";
    }
    else
    {
        modeLabel = "";
    }
    Output.WriteLine("Performing {0}{1}-fold cross validation...", modeLabel, numFolds);

    // fold numbers are 1-based
    for (int foldN = 1; foldN <= numFolds; foldN++)
    {
        LabeledDataset<string, SparseVector<double>> trainPart;
        LabeledDataset<string, SparseVector<double>> testPart;
        if (!useStratification)
        {
            examples.SplitForCrossValidation(numFolds, foldN, out trainPart, out testPart);
        }
        else
        {
            examples.SplitForStratifiedCrossValidation(numFolds, foldN, out trainPart, out testPart);
        }

        svm.Train(trainPart);

        PerfMatrix<string> matrix = results.GetPerfMatrix(ExperimentName, AlgorithmName, foldN);
        foreach (LabeledExample<string, SparseVector<double>> item in testPart)
        {
            Prediction<string> outcome = svm.Predict(item.Example);
            matrix.AddCount(item.Label, outcome.BestClassLabel);
        }
        Output.WriteLine("Accuracy for {0}-fold: {1:0.00}", foldN, matrix.GetAccuracy());
    }

    // report on the matrix summed over all folds
    Output.WriteLine("Sum confusion matrix:");
    PerfMatrix<string> total = results.GetSumPerfMatrix(ExperimentName, AlgorithmName);
    Output.WriteLine(total.ToString());
    Output.WriteLine("Average accuracy: {0:0.00}", total.GetAccuracy());
    Output.WriteLine();
    Output.WriteLine(total.ToString(new PerfMetric[] { }));
    Output.WriteLine(total.ToString(results.GetLabels(ExperimentName, AlgorithmName), new OrdinalPerfMetric[] { }));
    Output.WriteLine(total.ToString(new ClassPerfMetric[] { }));

    // per-label precision averaged by the performance data, with its standard deviation
    foreach (string label in results.GetLabels(ExperimentName, AlgorithmName))
    {
        double stdDev;
        double avgPrecision = results.GetAvg(ExperimentName, AlgorithmName, ClassPerfMetric.Precision, label, out stdDev);
        Output.WriteLine("Precision for '{0}': {1:0.00} std. dev: {2:0.00}", label, avgPrecision, stdDev);
    }
}