public TextSource Process(IEnumerable<string> documents, string name)
        {
            TextSource result = new TextSource();

            foreach (var item in documents)
            {
                var doc = new DocumentSource();
                var sItem = item.Replace(".", " . ")
                                .Replace(",", " , ")
                                .Replace("\r\n", " \r##n ")
                                .Replace("\n", " \n ")
                                .Replace("##n", "\n");
                // treat punctuation marks and line breaks as separate words;
                // "\r\n" is masked as "##n" first so that the bare "\n"
                // replacement does not split it into two segments

                foreach (var segment in sItem.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
                {
                    doc.LanguageSegments.Add(segment);
                }

                result.Documents.Add(doc);
            }

            result.Name = name;
            return result;
        }
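A minimal usage sketch of the word-level processor above (the sample input and the "demo" name are illustrative; the method is assumed to live on the WordLevelProcessor class used further down):

        // "Hello, world.\nBye." tokenizes to
        // ["Hello", ",", "world", ".", "\n", "Bye", "."]
        var source = new WordLevelProcessor().Process(new[] { "Hello, world.\nBye." }, "demo");
        Console.WriteLine(string.Join("|", source.Documents[0].LanguageSegments));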
        public AbsoluteSmoothing(TextSource referenceSource, int n)
        {
            _referenceSource = referenceSource;

            if (n <= 1)
            {
                // no discounting for unigrams
                _b = 0;
                return;
            }

            // descend (n - 1) levels into the cache tree so that nGrams
            // enumerates the cache entries for all n-grams of order n
            IEnumerable<NGramCache> nGrams = referenceSource.GetNGramCache().NextSegment.Values;

            while (n > 1)
            {
                n--;
                nGrams = nGrams.SelectMany(el => el.NextSegment.Values);
            }

            // n1 = number of n-grams seen exactly once,
            // n2 = number of n-grams seen exactly twice
            int n1 = 0;
            int n2 = 0;

            foreach (var item in nGrams)
            {
                var frequency = Convert.ToInt32(item.Value);
                if (frequency == 1) n1++;
                if (frequency == 2) n2++;
            }

            // absolute-discounting parameter b = n1 / (n1 + 2 * n2)
            // (assumes the reference source contains at least one singleton)
            _b = (double)n1 / ((double)n1 + 2 * (double)n2);
        }
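For intuition, a worked example with hypothetical counts: a reference corpus with 30 n-grams occurring exactly once (n1 = 30) and 10 occurring exactly twice (n2 = 10) yields a discount of b = 30 / (30 + 2 * 10) = 0.6.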
        public TextSource Process(IEnumerable<string> documents, string name)
        {
            TextSource result = new TextSource();

            foreach (var item in documents)
            {
                var doc = new DocumentSource();

                // character-level variant: every single character
                // (including whitespace) becomes its own segment
                foreach (var segment in item)
                {
                    doc.LanguageSegments.Add(segment.ToString());
                }

                result.Documents.Add(doc);
            }

            result.Name = name;
            return result;
        }
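The character-level counterpart can be exercised the same way; the class name below is an assumption, since it does not appear in this excerpt:

        // Hypothetical usage: "ab." becomes the segments ["a", "b", "."].
        var charSource = new CharacterLevelProcessor().Process(new[] { "ab." }, "demo");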
Example #4
        public LinearSmoothing(TextSource referenceSource, int n)
        {
            var segments = referenceSource.GetAllSegments();

            // n1 = number of distinct n-grams that occur exactly once
            int n1 = 0;

            // already-counted n-grams; the linear scan with SequenceEqual is
            // quadratic in the number of distinct n-grams, which is acceptable
            // only for small reference sources
            List<IEnumerable<string>> checkedGrams = new List<IEnumerable<string>>();

            foreach (var item in referenceSource.Documents)
            {
                for (int i = 0; i <= item.LanguageSegments.Count - n; i++)
                {
                    IEnumerable<string> ngram = item.LanguageSegments.Skip(i).Take(n).ToArray();
                    if (checkedGrams.Any(el => el.SequenceEqual(ngram))) continue;

                    checkedGrams.Add(ngram);
                    int frequency = referenceSource.FindOccurrences(ngram);

                    if (frequency == 1) n1++;
                }
            }

            // linear-discounting factor 1 - n1 / N, where N is the total
            // number of segments in the reference source (reuses the segments
            // enumerated above instead of recomputing them)
            factor = 1.0 - ((double)n1 / (double)segments.Count());
        }
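Again with hypothetical counts: if 400 distinct n-grams occur exactly once in a reference source of 10,000 segments, the factor is 1 - 400 / 10000 = 0.96.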
Example #5
        public void Start()
        {
            var bayesClassifier = new BayesTextClassifier();

            var docReader = new ReadDocumentFromXmlFile();
            var docPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "data");
            var authors = new DirectoryInfo(docPath).GetDirectories();

            var categories = new List<TextSource>();
            var processor = new WordLevelProcessor();

            // Prepare data
            foreach (var item in authors)
            {
                var docs = item.GetFiles();
                var dataSource = new List<string>();

                foreach (var doc in docs)
                {
                    try
                    {
                        dataSource.Add(docReader.ReadDocumentText(doc.FullName, Encoding.GetEncoding(1253), new CultureInfo("el-GR")));
                    }
                    catch
                    {
                        Console.WriteLine("Document {0} unreadable", doc.FullName);
                    }
                }

                categories.Add(processor.Process(dataSource, item.Name));
            }

            Console.WriteLine("Scanned {1} documents in {0} categories", categories.Count, categories.Select(el => el.Documents.Count).Aggregate((el1, el2) => el1 + el2));

            var testPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "test");
            var testAuthors = new DirectoryInfo(testPath).GetDirectories();
            var allInOne = new TextSource();
            allInOne.Documents.AddRange(categories.SelectMany(el => el.Documents));

            // choose n from 1 to 4
            for (int n = 1; n <= 4; n++)
            {
                Console.WriteLine("-----PREPARE for n = {0}", n);
                Console.WriteLine("Building hash tables ..", n);

                Parallel.ForEach(categories, category =>
                {
                    category.BuildSegmentTable(n);
                });

                Console.WriteLine("Getting smoothing ready ..");
                var smoothing = new AbsoluteSmoothing(allInOne, n);
                var categoriesToTest = new Dictionary<TextSource, CategoryProbabilityDistribution>();

                foreach (var cat in categories)
                {
                    categoriesToTest[cat] = new CategoryProbabilityDistribution(cat, smoothing, n);
                }

                int rightClassified = 0;
                int wrongClassified = 0;

                Console.WriteLine("-----Algorithm starts now");
                foreach (var testAuthor in testAuthors)
                {
                    Parallel.ForEach(testAuthor.GetFiles(), testDocument =>
                    {
                        // read and tokenize each test document once,
                        // rather than once per candidate category
                        var docText = new[] { docReader.ReadDocumentText(testDocument.FullName, Encoding.GetEncoding(1253), new CultureInfo("el-GR")) };
                        var docSource = processor.Process(docText, testAuthor.Name).Documents.First();

                        TextSource topCategory = null;
                        var maxProb = 0.0;

                        foreach (var catDist in categoriesToTest)
                        {
                            double p = bayesClassifier.P_c(catDist.Value, docSource, n, 1.0 / (double)categories.Count);

                            if (topCategory == null || p > maxProb)
                            {
                                topCategory = catDist.Key;
                                maxProb = p;
                            }
                        }

                        Console.WriteLine("Classified {0} as author {1} - {2}", testDocument.Name, topCategory.Name, topCategory.Name == testAuthor.Name ? "correct" : "incorrect");

                        // the counters are shared across the parallel loop,
                        // so update them atomically
                        if (topCategory.Name == testAuthor.Name) Interlocked.Increment(ref rightClassified);
                        else Interlocked.Increment(ref wrongClassified);
                    });
                }

                Console.WriteLine("-----SUMMARY");
                Console.WriteLine("Success rate for n={0} is {1}\n", n, (double)rightClassified / ((double)rightClassified + (double)wrongClassified));
            }
        }
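The loop above applies the standard naive-Bayes decision rule: each document goes to the author c maximizing P(c) * P(document | c), with the uniform prior 1 / |categories| passed into P_c. A minimal sketch of such a scorer, assuming P_c combines the smoothed per-n-gram probabilities (the actual BayesTextClassifier implementation is not shown in this excerpt, and dist.P below is an assumed accessor):

        // Hypothetical sketch only, NOT the original BayesTextClassifier.P_c.
        // Accumulating in log space avoids numeric underflow on long documents;
        // note the loops above compare raw probabilities against an initial 0.0,
        // so the real P_c presumably returns probabilities, not log-probabilities.
        public double PcSketch(CategoryProbabilityDistribution dist, DocumentSource doc, int n, double prior)
        {
            double logP = Math.Log(prior);

            for (int i = 0; i <= doc.LanguageSegments.Count - n; i++)
            {
                var ngram = doc.LanguageSegments.Skip(i).Take(n);
                logP += Math.Log(dist.P(ngram)); // assumed smoothed n-gram probability
            }

            return logP;
        }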
        public CategoryProbabilityDistribution(TextSource referenceSource, ISmoothingTechnique smoothing, int n)
        {
            _smoothing = smoothing;
            _referenceSource = referenceSource;
            // note: the n-gram order parameter is accepted but never stored in this excerpt
        }
Example #7
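The Start overload below drives the same classifier from a Tira-style run folder. For orientation, the meta-file.json it reads has roughly the following shape; the field names are exactly those the code accesses, while the values are illustrative:

        {
            "folder": "unknown",
            "encoding": "UTF8",
            "language": "EN",
            "candidate-authors": [
                { "author-name": "candidate00001" }
            ],
            "unknown-texts": [
                { "unknown-text": "unknown00001.txt" }
            ]
        }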
        public void Start(string mainFolder)
        {
            var resultSet = new ResultSet();
            var bayesClassifier = new BayesTextClassifier();

            var docReader = new ReadDocumentFromTiraFile();

            var categories = new List<TextSource>();
            var processor = new WordLevelProcessor();

            Console.WriteLine("Scanning...");
            dynamic jsonConfig;

            using (StreamReader sr = new StreamReader(Path.Combine(mainFolder, "meta-file.json")))
            {
                jsonConfig = JsonConvert.DeserializeObject(sr.ReadToEnd());
            }

            var unknownFolder = (string)jsonConfig.folder;
            var encodingString = (string)jsonConfig.encoding;
            var cultureString = (string)jsonConfig.language;

            CultureInfo ci = null;
            switch (cultureString)
            {
                case "EN":
                    ci = new CultureInfo("en-US");
                    break;
                default:
                    throw new ApplicationException("culture not found");
            }

            Encoding encoding = null;
            switch (encodingString)
            {
                case "UTF8":
                    encoding = Encoding.UTF8;
                    break;
                case "ASCII":
                    encoding = Encoding.ASCII;
                    break;
                default:
                    throw new ApplicationException("encoding not found");
            }

            foreach (var item in jsonConfig["candidate-authors"])
            {
                var authorName = (string)item["author-name"];
                var docs = new DirectoryInfo(Path.Combine(mainFolder, authorName)).GetFiles();
                var dataSource = new List<string>();

                foreach (var doc in docs)
                {
                    try
                    {
                        dataSource.Add(docReader.ReadDocumentText(doc.FullName, encoding, ci));
                    }
                    catch
                    {
                        Console.WriteLine("Document {0} unreadable", doc.FullName);
                    }
                }

                categories.Add(processor.Process(dataSource, authorName));
            }

            int n = 3; // choose n=3

            Console.WriteLine("Scanned {1} documents in {0} categories", categories.Count, categories.Select(el => el.Documents.Count).Aggregate((el1, el2) => el1 + el2));

            var allInOne = new TextSource();
            allInOne.Documents.AddRange(categories.SelectMany(el => el.Documents));

            Console.WriteLine("Building hash tables ..", n);

            Parallel.ForEach(categories, category =>
            {
                for (int i = 1; i <= n; i++)
                {
                    category.BuildSegmentTable(i);
                    Console.WriteLine("hashed {0} with n={1}", category.Name, i);
                }
            });

            allInOne.SetNGramCache(NGramCache.Aggregate(categories.Select(el => el.GetNGramCache())));
            Console.WriteLine("aggregated hashing");

            Console.WriteLine("Getting smoothing ready ..");
            var smoothing = new AbsoluteSmoothing(allInOne, n);
            var categoriesToTest = new Dictionary<TextSource, CategoryProbabilityDistribution>();

            foreach (var cat in categories)
            {
                categoriesToTest[cat] = new CategoryProbabilityDistribution(cat, smoothing, n);
            }

            Console.WriteLine("Start classifying ..");
            int totalProgress = jsonConfig["unknown-texts"].Count * categoriesToTest.Count;
            int progress = 0;

            foreach (var item in jsonConfig["unknown-texts"])
            {
                TextSource topCategory = null;
                var maxProb = 0.0;
                var textName = (string)item["unknown-text"];
                var probs = new List<double>();

                // progress timer; stopped and disposed once classification
                // of this document has finished
                System.Timers.Timer t = new System.Timers.Timer(5000);
                t.Elapsed += (sender, eventArgs) =>
                {
                    Console.Title = "Task is Running. Progress: " + Math.Round((((double)progress / (double)totalProgress) * 100.0), 2).ToString();
                };
                t.AutoReset = true;
                t.Start();

                // read and tokenize the unknown document once,
                // rather than once per candidate category
                var docText = new string[] { docReader.ReadDocumentText(Path.Combine(mainFolder, unknownFolder, textName), encoding, ci) };
                var docSource = processor.Process(docText, "unknown").Documents.First();

                var sync = new object();

                Parallel.ForEach(categoriesToTest, catDist =>
                {
                    double p = bayesClassifier.P_c(catDist.Value, docSource, n, 1.0 / categories.Count);

                    // List<double> and the top-category variables are not
                    // thread-safe, so guard the shared state with a lock
                    lock (sync)
                    {
                        probs.Add(p);

                        if (topCategory == null || p > maxProb)
                        {
                            topCategory = catDist.Key;
                            maxProb = p;
                        }
                    }

                    Interlocked.Increment(ref progress);
                });

                t.Stop();
                t.Dispose();

                // confidence score: compare the winning probability with the
                // runner-up probabilities
                probs.Remove(maxProb);
                double pre_score = 0.0;
                double max_sub_score = 0.0;

                foreach (var p in probs)
                {
                    var subScore = Math.Abs((maxProb - p) / maxProb) * Math.Exp(3); // normalized difference, scaled by e^3
                    var eSubScore = Math.Exp(-subScore);
                    pre_score += eSubScore;

                    if (eSubScore > max_sub_score)
                        max_sub_score = eSubScore;
                }

                // eSubScore approaches 1 when a competitor is nearly as likely as
                // the winner and 0 when the winner is far ahead, so the final score
                // is near 1 for confident decisions and near 0 for narrow ones
                double score = Math.Round(1.0 - (0.5 * (pre_score / probs.Count) + 0.5 * max_sub_score), 2);

                Console.WriteLine("Classified {0} as author {1} with score {2}", textName, topCategory.Name, score);
                resultSet.answers.Add(new Result(textName, topCategory.Name, score));

                Console.WriteLine("writing data to file ...");
                string data = JsonConvert.SerializeObject(resultSet, Formatting.Indented);
                using (StreamWriter sw = new StreamWriter(Path.Combine(mainFolder, "results.json"), false))
                {
                    sw.Write(data);
                    sw.Flush();
                }
            }
        }
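To make the scoring concrete with hypothetical numbers: if the winning author gets maxProb = 0.010 and the single runner-up gets p = 0.009, the normalized difference is 0.1, so subScore is about 0.1 * e^3, roughly 2.01, and eSubScore is about 0.13; both the average and the maximum term are then 0.13, giving score = round(1 - 0.13, 2) = 0.87, a fairly confident decision. A runner-up tied with the winner would instead drive the score toward 0.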