/// <summary>
/// Constructs the distribution by storing the given reference text source and smoothing technique.
/// </summary>
/// <param name="referenceSource">Text source stored in <c>_referenceSource</c>.</param>
/// <param name="smoothing">Smoothing technique stored in <c>_smoothing</c>.</param>
/// <param name="n">Not read by any line visible here. NOTE(review): presumably the n-gram order; confirm against the full class.</param>
public CategoryProbabilityDistribution(TextSource referenceSource, ISmoothingTechnique smoothing, int n)
     // NOTE(review): the constructor's braces are absent from this excerpt.
     _smoothing = smoothing;
     _referenceSource = referenceSource;
 /// <summary>
 /// Constructs the distribution by storing the given reference text source and smoothing technique.
 /// </summary>
 /// <param name="referenceSource">Text source stored in <c>_referenceSource</c>.</param>
 /// <param name="smoothing">Smoothing technique stored in <c>_smoothing</c>.</param>
 /// <param name="n">Not read by any line visible here. NOTE(review): presumably the n-gram order; confirm against the full class.</param>
 public CategoryProbabilityDistribution(TextSource referenceSource, ISmoothingTechnique smoothing, int n)
     // NOTE(review): the constructor's braces are absent from this excerpt.
     _smoothing       = smoothing;
     _referenceSource = referenceSource;
Ejemplo n.º 3
        /// <summary>
        /// Runs an authorship-attribution experiment driven by <c>meta-file.json</c> in
        /// <paramref name="mainFolder"/>: loads the documents of every candidate author, prepares the
        /// n-gram caches and the smoothing technique, classifies every unknown text with a Bayes
        /// classifier, and serializes the results to <c>answers.json</c> in <paramref name="outputDir"/>.
        /// </summary>
        /// <param name="mainFolder">Folder containing <c>meta-file.json</c>, one sub-folder per candidate author, and the folder of unknown texts named in the config.</param>
        /// <param name="outputDir">Folder in which <c>answers.json</c> is written.</param>
        /// <param name="smoothing">Smoothing technique; initialized here on the combined corpus and handed to each category distribution.</param>
        /// <param name="nGramSize">N-gram order passed to the smoothing initialization, the category distributions and the classifier.</param>
        /// <param name="processor">Turns raw document strings into a <c>TextSource</c>.</param>
        /// <exception cref="ApplicationException">Thrown with "culture not found" / "encoding not found" (see the notes at the two switch statements).</exception>
        public void Start(string mainFolder, string outputDir, ISmoothingTechnique smoothing, int nGramSize, ILanguageProcessor processor)
            // NOTE(review): brace-only lines (and apparently some other short statements) are absent from
            // this excerpt, so the nesting described in the comments below is inferred from indentation.
            var resultSet = new ResultSet();
            var bayesClassifier = new BayesTextClassifier();

            var docReader = new ReadDocumentFromTiraFile();

            var categories = new List<TextSource>();

            dynamic jsonConfig;

            // Load the experiment description; it is accessed dynamically below.
            using (StreamReader sr = new StreamReader(Path.Combine(mainFolder, "meta-file.json")))
                jsonConfig = JsonConvert.DeserializeObject(sr.ReadToEnd());

            var unknownFolder = (string)jsonConfig.folder;
            var encodingString = (string)jsonConfig.encoding;
            var cultureString = (string)jsonConfig.language;

            // Map the configured language code to a culture; "EN" is the only code handled in the visible lines.
            CultureInfo ci = null;
            switch (cultureString)
                case "EN":
                    ci = new CultureInfo("en-US");
                    // NOTE(review): presumably the default branch; the `break;` / `default:` lines are not visible here.
                    throw new ApplicationException("culture not found");

            // Map the configured encoding name to an Encoding instance ("UTF8" or "ASCII").
            Encoding encoding = null;
            switch (encodingString)
                case "UTF8":
                    encoding = Encoding.UTF8;
                case "ASCII":
                    encoding = Encoding.ASCII;
                    // NOTE(review): presumably the default branch; the `break;` / `default:` lines are not visible here.
                    throw new ApplicationException("encoding not found");

            // Build one category per candidate author from the files in mainFolder/<author-name>.
            foreach(var item in jsonConfig["candidate-authors"])
                var authorName = (string)item["author-name"];
                var docs = new DirectoryInfo(Path.Combine(mainFolder, authorName)).GetFiles();
                var dataSource = new List<string>();

                foreach (var doc in docs)
                        // NOTE(review): the "unreadable" message presumably sits in a catch around the read;
                        // the try/catch lines are not visible in this excerpt.
                        dataSource.Add(docReader.ReadDocumentText(doc.FullName, encoding, ci));
                        Console.WriteLine("Document {0} unreadable", doc.FullName);

                categories.Add(processor.Process(dataSource, authorName));

            int n = nGramSize;

            // Aggregate without a seed throws InvalidOperationException on an empty sequence,
            // i.e. when the config lists no candidate authors.
            Console.WriteLine("Scanned {1} documents in {0} categories", categories.Count, categories.Select(el => el.Documents.Count).Aggregate((el1, el2) => el1 + el2));

            // A single source holding the documents of every category.
            var allInOne = new TextSource();
            allInOne.Documents.AddRange(categories.SelectMany(el => el.Documents));

            // The extra argument n is ignored: the format string has no placeholder.
            Console.WriteLine("Building hash tables ..", n);

            // Per category, in parallel, loop over the n-gram orders 1..n.
            // NOTE(review): only the log line is visible in the loop body; the call that actually builds
            // the table is presumably missing from this excerpt.
            Parallel.ForEach(categories, category =>
                for (int i = 1; i <= n; i++)
                    Console.WriteLine("hashed {0} with n={1}", category.Name, i);

            // Combine the per-category n-gram caches into the cache of the all-documents source.
            allInOne.SetNGramCache(NGramCache.Aggregate(categories.Select(el => el.GetNGramCache())));
            Console.WriteLine("aggregated hashing");

            Console.WriteLine("Getting smoothing ready ..");
            smoothing.Init(allInOne, nGramSize);
            var categoriesToTest = new Dictionary<TextSource, CategoryProbabilityDistribution>();

            // One probability distribution per category, all sharing the same smoothing instance.
            foreach (var cat in categories)
                categoriesToTest[cat] = new CategoryProbabilityDistribution(cat, smoothing, n);

            Console.WriteLine("Start classifying ..");
            // One unit of progress per (unknown text, category) pair.
            int totalProgress = jsonConfig["unknown-texts"].Count * categoriesToTest.Count;
            int progress = 0;

            // Every 5000 ms the handler shows the progress percentage in the console title.
            // NOTE(review): no call that starts, stops or disposes the timer is visible in this excerpt.
            System.Timers.Timer t = new System.Timers.Timer(5000);
            t.Elapsed += (sender, eventArgs) =>
                Console.Title = "Task is Running. Progress: " + Math.Round((((double)progress / (double)totalProgress) * 100.0), 2).ToString() + "%";

            t.AutoReset = true;

            foreach (var item in jsonConfig["unknown-texts"])
                TextSource topCategory = null;
                // The initial value is never compared: the first result replaces it because topCategory is still null.
                var maxProb = 0.0;
                var textName = (string)item["unknown-text"];
                var probs = new List<double>();

                Parallel.ForEach(categoriesToTest, catDist =>
                    // The unknown document is read and processed again for every category.
                    var docText = new string[] { docReader.ReadDocumentText(Path.Combine(mainFolder, unknownFolder, textName), encoding, ci) };
                    var docSource = processor.Process(docText, "unknown").Documents.First();

                    // Last argument: the share of all training documents that belong to this category.
                    double p = bayesClassifier.P_c(catDist.Value, docSource, n, (double)catDist.Key.Documents.Count / (double)allInOne.Documents.Count);

                    // Guards the best-so-far state shared by the parallel iterations.
                    // NOTE(review): nothing visible adds p to probs, although the scoring loop below iterates
                    // it; the Add call is presumably missing from this excerpt.
                    lock (probs)

                        if (topCategory == null || p > maxProb)
                            topCategory = catDist.Key;
                            maxProb = p;

                    Interlocked.Increment(ref progress);

                // getting the score
                double pre_score = 0.0;
                double max_sub_score = 0.0;

                // For each probability: relative distance to the best one, scaled by e^3 and mapped through
                // exp(-x); the sum and the maximum of the mapped values are collected.
                foreach (var p in probs)
                    var subScore = Math.Abs((maxProb - p) / maxProb) * Math.Pow(Math.E, 3); // normalized difference
                    var eSubScore = Math.Exp(-subScore);
                    pre_score += eSubScore;

                    if (eSubScore > max_sub_score)
                        max_sub_score = eSubScore;

                // 1 minus the average of (mean mapped value, max mapped value), rounded to two decimals.
                // With an empty probs list the division is 0.0 / 0 and the score becomes NaN.
                double score = Math.Round(1.0 - (0.5 * (pre_score / probs.Count) + 0.5 * max_sub_score), 2);

                Console.WriteLine("Classified {0} as author {1} with score {2}", textName, topCategory.Name, score);
                resultSet.answers.Add(new Result(textName, topCategory.Name, score));

                // NOTE(review): judging by indentation this sits inside the per-text loop, so answers.json
                // (opened with append: false) would be rewritten after every classified text; the body of
                // the using statement is not visible in this excerpt.
                Console.WriteLine("writing data to file ...");
                string data = JsonConvert.SerializeObject(resultSet, Formatting.Indented);
                using (StreamWriter sw = new StreamWriter(Path.Combine(outputDir, "answers.json"), false))
Ejemplo n.º 4
        /// <summary>
        /// Runs a train/test classification experiment on the <c>data</c> and <c>test</c> folders under the
        /// application base directory: each sub-folder of <c>data</c> is one author (category), and each
        /// file under <c>test/&lt;author&gt;</c> is classified for n-gram orders 1 through 4, printing the
        /// success rate for every order.
        /// </summary>
        /// <param name="processor">Turns raw document strings into a <c>TextSource</c>.</param>
        /// <param name="smoothing">Smoothing technique; re-initialized on the combined corpus for every n.</param>
        public void Start(ILanguageProcessor processor, ISmoothingTechnique smoothing)
            // NOTE(review): brace-only lines (and apparently some other short statements) are absent from
            // this excerpt, so the nesting described in the comments below is inferred from indentation.
            var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            var bayesClassifier = new BayesTextClassifier();

            var docReader = new ReadDocumentFromXmlFile();
            var docPath = Path.Combine(baseDirectory, "data");
            var authors = new DirectoryInfo(docPath).GetDirectories();

            var categories = new List<TextSource>();

            // Prepare data
            foreach (var item in authors)
                var docs = item.GetFiles();
                var dataSource = new List<string>();

                foreach (var doc in docs)
                        // Documents are read with code page 1253 and the el-GR culture.
                        // NOTE(review): the "unreadable" message presumably sits in a catch around the read;
                        // the try/catch lines are not visible in this excerpt.
                        dataSource.Add(docReader.ReadDocumentText(doc.FullName, Encoding.GetEncoding(1253), new CultureInfo("el-GR")));
                        Console.WriteLine("Document {0} unreadable", doc.FullName);

                categories.Add(processor.Process(dataSource, item.Name));

            // Aggregate without a seed throws InvalidOperationException on an empty sequence,
            // i.e. when the data folder has no author sub-folders.
            Console.WriteLine("Scanned {1} documents in {0} categories", categories.Count, categories.Select(el => el.Documents.Count).Aggregate((el1, el2) => el1 + el2));

            var testPath = Path.Combine(baseDirectory, "test");
            var testAuthors = new DirectoryInfo(testPath).GetDirectories();
            // A single source holding the documents of every category.
            var allInOne = new TextSource();
            allInOne.Documents.AddRange(categories.SelectMany(el => el.Documents));

            // choose n from 1 to 4
            for (int n = 1; n <= 4; n++)
                Console.WriteLine("-----PREPARE for n = {0}", n);
                // The extra argument n is ignored: the format string has no placeholder.
                Console.WriteLine("Building hash tables ..", n);

                // NOTE(review): the body of this parallel loop is not visible in this excerpt.
                Parallel.ForEach(categories, category =>

                // Combine the per-category n-gram caches into the cache of the all-documents source.
                allInOne.SetNGramCache(NGramCache.Aggregate(categories.Select(el => el.GetNGramCache())));

                Console.WriteLine("Getting smoothing ready ..");
                smoothing.Init(allInOne, n);
                var categoriesToTest = new Dictionary<TextSource, CategoryProbabilityDistribution>();

                // One probability distribution per category, all sharing the same smoothing instance.
                foreach(var cat in categories)
                    categoriesToTest[cat] = new CategoryProbabilityDistribution(cat, smoothing, n);

                int rightClassified = 0;
                int wrongClassified = 0;

                Console.WriteLine("-----Algorithm starts now");
                foreach (var testAuthor in testAuthors)
                    foreach (var testDocument in testAuthor.GetFiles())
                        TextSource topCategory = null;
                        // The initial value is never compared: the first result replaces it because topCategory is still null.
                        var maxProb = 0.0;

                       Parallel.ForEach(categoriesToTest, catDist =>
                           // The test document is read and processed again for every category.
                           var docText = new[] { docReader.ReadDocumentText(testDocument.FullName, Encoding.GetEncoding(1253), new CultureInfo("el-GR")) };
                           var docSource = processor.Process(docText, testAuthor.Name).Documents.First();

                           // Last argument: the same value, 1 / number of categories, for every category.
                           double p = bayesClassifier.P_c(catDist.Value, docSource, n, 1.0 / (double)categories.Count);

                           // NOTE(review): topCategory and maxProb are shared by all parallel iterations and no
                           // lock is visible around this check-then-write; confirm whether one was dropped
                           // from this excerpt, otherwise the selected category can be wrong under contention.
                           if (topCategory == null || p > maxProb)
                               topCategory = catDist.Key;
                               maxProb = p;

                        // A test document counts as correct when the winning category's name equals its folder name.
                        Console.WriteLine("Classified {0} as author {1} - {2}", testDocument.Name, topCategory.Name, topCategory.Name == testAuthor.Name ? "correct" : "incorrect");

                        if (topCategory.Name == testAuthor.Name) rightClassified++;
                        else wrongClassified++;

                // Fraction of correctly classified test documents; NaN (0.0 / 0.0) when nothing was classified.
                Console.WriteLine("Success rate for n={0} is {1}\n", n, (double)rightClassified / ((double)rightClassified + (double)wrongClassified));
Ejemplo n.º 5
        /// <summary>
        /// Runs an authorship-attribution experiment driven by <c>meta-file.json</c> in
        /// <paramref name="mainFolder"/>: loads the documents of every candidate author, prepares the
        /// n-gram caches and the smoothing technique, classifies every unknown text with a Bayes
        /// classifier, and serializes the results to <c>answers.json</c> in <paramref name="outputDir"/>.
        /// </summary>
        /// <param name="mainFolder">Folder containing <c>meta-file.json</c>, one sub-folder per candidate author, and the folder of unknown texts named in the config.</param>
        /// <param name="outputDir">Folder in which <c>answers.json</c> is written.</param>
        /// <param name="smoothing">Smoothing technique; initialized here on the combined corpus and handed to each category distribution.</param>
        /// <param name="nGramSize">N-gram order passed to the smoothing initialization, the category distributions and the classifier.</param>
        /// <param name="processor">Turns raw document strings into a <c>TextSource</c>.</param>
        /// <exception cref="ApplicationException">Thrown with "culture not found" / "encoding not found" (see the notes at the two switch statements).</exception>
        public void Start(string mainFolder, string outputDir, ISmoothingTechnique smoothing, int nGramSize, ILanguageProcessor processor)
            // NOTE(review): brace-only lines (and apparently some other short statements) are absent from
            // this excerpt, so the nesting described in the comments below is inferred from indentation.
            var resultSet       = new ResultSet();
            var bayesClassifier = new BayesTextClassifier();

            var docReader = new ReadDocumentFromTiraFile();

            var categories = new List <TextSource>();

            dynamic jsonConfig;

            // Load the experiment description; it is accessed dynamically below.
            using (StreamReader sr = new StreamReader(Path.Combine(mainFolder, "meta-file.json")))
                jsonConfig = JsonConvert.DeserializeObject(sr.ReadToEnd());

            var unknownFolder  = (string)jsonConfig.folder;
            var encodingString = (string)jsonConfig.encoding;
            var cultureString  = (string)jsonConfig.language;

            // Map the configured language code to a culture; "EN" is the only code handled in the visible lines.
            CultureInfo ci = null;

            switch (cultureString)
            case "EN":
                ci = new CultureInfo("en-US");

                // NOTE(review): presumably the default branch; the `break;` / `default:` lines are not visible here.
                throw new ApplicationException("culture not found");

            // Map the configured encoding name to an Encoding instance ("UTF8" or "ASCII").
            Encoding encoding = null;

            switch (encodingString)
            case "UTF8":
                encoding = Encoding.UTF8;

            case "ASCII":
                encoding = Encoding.ASCII;

                // NOTE(review): presumably the default branch; the `break;` / `default:` lines are not visible here.
                throw new ApplicationException("encoding not found");

            // Build one category per candidate author from the files in mainFolder/<author-name>.
            foreach (var item in jsonConfig["candidate-authors"])
                var authorName = (string)item["author-name"];
                var docs       = new DirectoryInfo(Path.Combine(mainFolder, authorName)).GetFiles();
                var dataSource = new List <string>();

                foreach (var doc in docs)
                        // NOTE(review): the "unreadable" message presumably sits in a catch around the read;
                        // the try/catch lines are not visible in this excerpt.
                        dataSource.Add(docReader.ReadDocumentText(doc.FullName, encoding, ci));
                        Console.WriteLine("Document {0} unreadable", doc.FullName);

                categories.Add(processor.Process(dataSource, authorName));

            int n = nGramSize;

            // Aggregate without a seed throws InvalidOperationException on an empty sequence,
            // i.e. when the config lists no candidate authors.
            Console.WriteLine("Scanned {1} documents in {0} categories", categories.Count, categories.Select(el => el.Documents.Count).Aggregate((el1, el2) => el1 + el2));

            // A single source holding the documents of every category.
            var allInOne = new TextSource();

            allInOne.Documents.AddRange(categories.SelectMany(el => el.Documents));

            // The extra argument n is ignored: the format string has no placeholder.
            Console.WriteLine("Building hash tables ..", n);

            // Per category, in parallel, loop over the n-gram orders 1..n.
            // NOTE(review): only the log line is visible in the loop body; the call that actually builds
            // the table is presumably missing from this excerpt.
            Parallel.ForEach(categories, category =>
                for (int i = 1; i <= n; i++)
                    Console.WriteLine("hashed {0} with n={1}", category.Name, i);

            // Combine the per-category n-gram caches into the cache of the all-documents source.
            allInOne.SetNGramCache(NGramCache.Aggregate(categories.Select(el => el.GetNGramCache())));
            Console.WriteLine("aggregated hashing");

            Console.WriteLine("Getting smoothing ready ..");
            smoothing.Init(allInOne, nGramSize);
            var categoriesToTest = new Dictionary <TextSource, CategoryProbabilityDistribution>();

            // One probability distribution per category, all sharing the same smoothing instance.
            foreach (var cat in categories)
                categoriesToTest[cat] = new CategoryProbabilityDistribution(cat, smoothing, n);

            Console.WriteLine("Start classifying ..");
            // One unit of progress per (unknown text, category) pair.
            int totalProgress = jsonConfig["unknown-texts"].Count * categoriesToTest.Count;
            int progress      = 0;

            // Every 5000 ms the handler shows the progress percentage in the console title.
            // NOTE(review): no call that starts, stops or disposes the timer is visible in this excerpt.
            System.Timers.Timer t = new System.Timers.Timer(5000);
            t.Elapsed += (sender, eventArgs) =>
                Console.Title = "Task is Running. Progress: " + Math.Round((((double)progress / (double)totalProgress) * 100.0), 2).ToString() + "%";

            t.AutoReset = true;

            foreach (var item in jsonConfig["unknown-texts"])
                TextSource topCategory = null;
                // The initial value is never compared: the first result replaces it because topCategory is still null.
                var        maxProb     = 0.0;
                var        textName    = (string)item["unknown-text"];
                var        probs       = new List <double>();

                Parallel.ForEach(categoriesToTest, catDist =>
                    // The unknown document is read and processed again for every category.
                    var docText   = new string[] { docReader.ReadDocumentText(Path.Combine(mainFolder, unknownFolder, textName), encoding, ci) };
                    var docSource = processor.Process(docText, "unknown").Documents.First();

                    // Last argument: the share of all training documents that belong to this category.
                    double p = bayesClassifier.P_c(catDist.Value, docSource, n, (double)catDist.Key.Documents.Count / (double)allInOne.Documents.Count);

                    // Guards the best-so-far state shared by the parallel iterations.
                    // NOTE(review): nothing visible adds p to probs, although the scoring loop below iterates
                    // it; the Add call is presumably missing from this excerpt.
                    lock (probs)

                        if (topCategory == null || p > maxProb)
                            topCategory = catDist.Key;
                            maxProb     = p;

                    Interlocked.Increment(ref progress);

                // getting the score
                double pre_score     = 0.0;
                double max_sub_score = 0.0;

                // For each probability: relative distance to the best one, scaled by e^3 and mapped through
                // exp(-x); the sum and the maximum of the mapped values are collected.
                foreach (var p in probs)
                    var subScore  = Math.Abs((maxProb - p) / maxProb) * Math.Pow(Math.E, 3); // normalized difference
                    var eSubScore = Math.Exp(-subScore);
                    pre_score += eSubScore;

                    if (eSubScore > max_sub_score)
                        max_sub_score = eSubScore;

                // 1 minus the average of (mean mapped value, max mapped value), rounded to two decimals.
                // With an empty probs list the division is 0.0 / 0 and the score becomes NaN.
                double score = Math.Round(1.0 - (0.5 * (pre_score / probs.Count) + 0.5 * max_sub_score), 2);

                Console.WriteLine("Classified {0} as author {1} with score {2}", textName, topCategory.Name, score);
                resultSet.answers.Add(new Result(textName, topCategory.Name, score));

                // NOTE(review): judging by indentation this sits inside the per-text loop, so answers.json
                // (opened with append: false) would be rewritten after every classified text; the body of
                // the using statement is not visible in this excerpt.
                Console.WriteLine("writing data to file ...");
                string data = JsonConvert.SerializeObject(resultSet, Formatting.Indented);
                using (StreamWriter sw = new StreamWriter(Path.Combine(outputDir, "answers.json"), false))
Ejemplo n.º 6
        /// <summary>
        /// Reads the experiment configuration from the application settings, echoes each setting to the
        /// console, and launches the experiment selected by the <c>experiment</c> key
        /// (<c>tira</c> or <c>expOne</c>).
        /// </summary>
        /// <param name="args">
        /// Command-line arguments. For the <c>tira</c> experiment: either <c>test</c> as the first argument
        /// (input and output both come from the <c>tiraTestFolder</c> setting), or an input directory
        /// followed by an output directory.
        /// </param>
        public void LoadAndRunExperiment(string[] args)
            // NOTE(review): brace-only lines and bare `else` lines are absent from this excerpt, so the
            // nesting described in the comments below is inferred from indentation.
            Console.WriteLine("---------------------- Prepare for execution [Reading config]");
            // double.Parse without a format provider uses the current culture, so the expected decimal
            // separator in the config file depends on the machine's regional settings.
            CalculationConstants.SmoothingEpsilon = double.Parse(ConfigurationManager.AppSettings["smoothingEpsilon"]);
            Console.WriteLine("smoothEpsilon:   {0}", CalculationConstants.SmoothingEpsilon);

            var smoothing          = ConfigurationManager.AppSettings["smoothing"];
            ISmoothingTechnique sm = null;

            // NOTE(review): sm stays null for any value other than "absolute" / "linear" and is then
            // passed on to the experiment as-is.
            if (smoothing == "absolute")
                sm = new AbsoluteSmoothing();
                Console.WriteLine("smoothing:       absolute smoothing");
            else if (smoothing == "linear")
                sm = new LinearSmoothing();
                Console.WriteLine("smoothing:       linear smoothing");

            // NOTE(review): only the log lines of the two branches are visible; the `else` and whatever
            // applies `param` to the smoothing object are not in this excerpt.
            if (ConfigurationManager.AppSettings.AllKeys.Contains("preInitSmoothingParameter"))
                var param = double.Parse(ConfigurationManager.AppSettings["preInitSmoothingParameter"]);
                Console.WriteLine("smoothing:       pre-init with param {0}", param);
                Console.WriteLine("smoothing:       no pre-init, getting parameters from data");

            var processor         = ConfigurationManager.AppSettings["processor"];
            ILanguageProcessor lp = null;

            // NOTE(review): lp stays null for any value other than "word" / "char".
            if (processor == "word")
                lp = new WordLevelProcessor();
                Console.WriteLine("processor:       word level processor");
            else if (processor == "char")
                lp = new CharacterLevelProcessor();
                Console.WriteLine("processor:       character level processor");

            var experiment = ConfigurationManager.AppSettings["experiment"];

            if (experiment == "tira")
                var nGramSize = int.Parse(ConfigurationManager.AppSettings["nGramSize"]);
                Console.WriteLine("n-gram size:     {0}", nGramSize);

                if (args.Any() && args[0] == "test")
                    // this is the test case
                    Console.WriteLine("setting:         test case");
                    Console.WriteLine("---------------------- Starting execution [TIRA Experiment]");

                    // The same configured folder is used as both input and output folder.
                    new TiraExperiment().Start(ConfigurationManager.AppSettings["tiraTestFolder"], ConfigurationManager.AppSettings["tiraTestFolder"], sm, nGramSize, lp);
                else if (!args.Any())
                    Console.WriteLine("\n\nyou must specify a directory!");
                else if (!Directory.Exists(args[0]))
                    Console.WriteLine("\n\nDirectory {0} not found!", args[0]);
                // NOTE(review): args[1] is read without checking that a second argument was supplied;
                // with a single argument this throws IndexOutOfRangeException.
                else if (!Directory.Exists(args[1]))
                    Console.WriteLine("\n\nDirectory {0} not found!", args[1]);
                    // Fixed: the format string had no placeholder, so the folder was never printed.
                    Console.WriteLine("setting:         real run on folder: {0}", Path.GetDirectoryName(args[0]));
                    Console.WriteLine("Starting execution ---------------------- [TIRA Experiment]");

                    new TiraExperiment().Start(args[0], args[1], sm, nGramSize, lp);
            else if (experiment == "expOne")
                Console.WriteLine("---------------------- Starting execution [GREEK Authors]");
                new ExperimentOne().Start(lp, sm);
Ejemplo n.º 7
        /// <summary>
        /// Runs a train/test classification experiment on the <c>data</c> and <c>test</c> folders under the
        /// application base directory: each sub-folder of <c>data</c> is one author (category), and each
        /// file under <c>test/&lt;author&gt;</c> is classified for n-gram orders 1 through 4, printing the
        /// success rate for every order.
        /// </summary>
        /// <param name="processor">Turns raw document strings into a <c>TextSource</c>.</param>
        /// <param name="smoothing">Smoothing technique; re-initialized on the combined corpus for every n.</param>
        public void Start(ILanguageProcessor processor, ISmoothingTechnique smoothing)
            // NOTE(review): brace-only lines (and apparently some other short statements) are absent from
            // this excerpt, so the nesting described in the comments below is inferred from indentation.
            var baseDirectory   = AppDomain.CurrentDomain.BaseDirectory;
            var bayesClassifier = new BayesTextClassifier();

            var docReader = new ReadDocumentFromXmlFile();
            var docPath   = Path.Combine(baseDirectory, "data");
            var authors   = new DirectoryInfo(docPath).GetDirectories();

            var categories = new List <TextSource>();

            // Prepare data
            foreach (var item in authors)
                var docs       = item.GetFiles();
                var dataSource = new List <string>();

                foreach (var doc in docs)
                        // Documents are read with code page 1253 and the el-GR culture.
                        // NOTE(review): the "unreadable" message presumably sits in a catch around the read;
                        // the try/catch lines are not visible in this excerpt.
                        dataSource.Add(docReader.ReadDocumentText(doc.FullName, Encoding.GetEncoding(1253), new CultureInfo("el-GR")));
                        Console.WriteLine("Document {0} unreadable", doc.FullName);

                categories.Add(processor.Process(dataSource, item.Name));

            // Aggregate without a seed throws InvalidOperationException on an empty sequence,
            // i.e. when the data folder has no author sub-folders.
            Console.WriteLine("Scanned {1} documents in {0} categories", categories.Count, categories.Select(el => el.Documents.Count).Aggregate((el1, el2) => el1 + el2));

            var testPath    = Path.Combine(baseDirectory, "test");
            var testAuthors = new DirectoryInfo(testPath).GetDirectories();
            // A single source holding the documents of every category.
            var allInOne    = new TextSource();

            allInOne.Documents.AddRange(categories.SelectMany(el => el.Documents));

            // choose n from 1 to 4
            for (int n = 1; n <= 4; n++)
                Console.WriteLine("-----PREPARE for n = {0}", n);
                // The extra argument n is ignored: the format string has no placeholder.
                Console.WriteLine("Building hash tables ..", n);

                // NOTE(review): the body of this parallel loop is not visible in this excerpt.
                Parallel.ForEach(categories, category =>

                // Combine the per-category n-gram caches into the cache of the all-documents source.
                allInOne.SetNGramCache(NGramCache.Aggregate(categories.Select(el => el.GetNGramCache())));

                Console.WriteLine("Getting smoothing ready ..");
                smoothing.Init(allInOne, n);
                var categoriesToTest = new Dictionary <TextSource, CategoryProbabilityDistribution>();

                // One probability distribution per category, all sharing the same smoothing instance.
                foreach (var cat in categories)
                    categoriesToTest[cat] = new CategoryProbabilityDistribution(cat, smoothing, n);

                int rightClassified = 0;
                int wrongClassified = 0;

                Console.WriteLine("-----Algorithm starts now");
                foreach (var testAuthor in testAuthors)
                    foreach (var testDocument in testAuthor.GetFiles())
                        TextSource topCategory = null;
                        // The initial value is never compared: the first result replaces it because topCategory is still null.
                        var        maxProb     = 0.0;

                        Parallel.ForEach(categoriesToTest, catDist =>
                            // The test document is read and processed again for every category.
                            var docText   = new[] { docReader.ReadDocumentText(testDocument.FullName, Encoding.GetEncoding(1253), new CultureInfo("el-GR")) };
                            var docSource = processor.Process(docText, testAuthor.Name).Documents.First();

                            // Last argument: the same value, 1 / number of categories, for every category.
                            double p = bayesClassifier.P_c(catDist.Value, docSource, n, 1.0 / (double)categories.Count);

                            // NOTE(review): topCategory and maxProb are shared by all parallel iterations and no
                            // lock is visible around this check-then-write; confirm whether one was dropped
                            // from this excerpt, otherwise the selected category can be wrong under contention.
                            if (topCategory == null || p > maxProb)
                                topCategory = catDist.Key;
                                maxProb     = p;

                        // A test document counts as correct when the winning category's name equals its folder name.
                        Console.WriteLine("Classified {0} as author {1} - {2}", testDocument.Name, topCategory.Name, topCategory.Name == testAuthor.Name ? "correct" : "incorrect");

                        // NOTE(review): the statements that update rightClassified / wrongClassified are not
                        // visible in this excerpt.
                        if (topCategory.Name == testAuthor.Name)

                // Fraction of correctly classified test documents; NaN (0.0 / 0.0) when both counters are 0.
                Console.WriteLine("Success rate for n={0} is {1}\n", n, (double)rightClassified / ((double)rightClassified + (double)wrongClassified));