public TextSource Process(IEnumerable<string> documents, string name)
{
    var result = new TextSource();
    foreach (var item in documents)
    {
        var doc = new DocumentSource();
        // Pad punctuation marks and line breaks with spaces so they are split
        // off as separate words. "\r\n" is masked as "##n" first so the
        // standalone "\n" replacement cannot tear the CRLF pair apart.
        var sItem = item.Replace(".", " . ")
                        .Replace(",", " , ")
                        .Replace("\r\n", " \r##n ")
                        .Replace("\n", " \n ")
                        .Replace("##n", "\n");
        foreach (var segment in sItem.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        {
            doc.LanguageSegments.Add(segment);
        }
        result.Documents.Add(doc);
    }
    result.Name = name;
    return result;
}
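A minimal usage sketch of this word-level processor; the hosting class name WordLanguageProcessor is a placeholder, since only the Process signature appears in the source:

// Hypothetical harness; "WordLanguageProcessor" stands in for whatever
// class hosts the Process method above.
var processor = new WordLanguageProcessor();
var source = processor.Process(new[] { "Hello, world.\nBye." }, "demo");
// The document is segmented into: "Hello", ",", "world", ".", "\n", "Bye", "."
// (punctuation and the line break each become a segment of their own).
foreach (var segment in source.Documents[0].LanguageSegments)
    Console.WriteLine(segment);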
public void Init(TextSource referenceSource, int n)
{
    if (_initPassed) return;
    _initPassed = true;
    if (n < 1)
    {
        _b = 0;
        return;
    }
    _referenceSource = referenceSource;

    // Count the n-grams that occur exactly once (n1) and exactly twice (n2)
    // in the reference corpus.
    int n1 = 0;
    int n2 = 0;
    IEnumerable<NGramCache> nGrams = referenceSource.GetNGramCache().NextSegment.Values;
    while (n > 1)
    {
        n--;
        nGrams = nGrams.SelectMany(el => el.NextSegment.Values);
    }
    foreach (var item in nGrams)
    {
        var frequency = Convert.ToInt32(item.Value);
        if (frequency == 1) n1++;
        if (frequency == 2) n2++;
    }

    // Absolute-discounting parameter.
    _b = (double)n1 / ((double)n1 + 2 * (double)n2);
}
public void Init(TextSource referenceSource, int n)
{
    if (_initPassed) return;
    _initPassed = true;
    if (n < 1)
    {
        _factor = 1;
        return;
    }

    // Count the n-grams that occur exactly once in the reference corpus.
    int n1 = 0;
    IEnumerable<NGramCache> nGrams = referenceSource.GetNGramCache().NextSegment.Values;
    while (n > 1)
    {
        n--;
        nGrams = nGrams.SelectMany(el => el.NextSegment.Values);
    }
    foreach (var item in nGrams)
    {
        var frequency = Convert.ToInt32(item.Value);
        if (frequency == 1) n1++;
    }
    _factor = 1.0 - ((double)n1 / (double)referenceSource.GetAllSegments().Count());
}
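Both Init methods estimate their discounting parameter from the count-of-counts of the reference corpus. The first computes the familiar absolute-discounting constant of Ney, Essen, and Kneser; the second can be read as reserving the Good-Turing estimate n1/N of probability mass for unseen events:

    b = n1 / (n1 + 2 * n2)
    factor = 1 - n1 / N

where n1 and n2 are the numbers of distinct n-grams occurring exactly once and exactly twice, and N is the total number of segments in the reference source.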
public TextSource Process(IEnumerable<string> documents, string name)
{
    var result = new TextSource();
    foreach (var item in documents)
    {
        var doc = new DocumentSource();
        // Character-level segmentation: every character becomes a segment.
        foreach (var segment in item)
        {
            doc.LanguageSegments.Add(segment.ToString());
        }
        result.Documents.Add(doc);
    }
    result.Name = name;
    return result;
}
public void Start(string mainFolder, string outputDir, ISmoothingTechnique smoothing, int nGramSize, ILanguageProcessor processor)
{
    var resultSet = new ResultSet();
    var bayesClassifier = new BayesTextClassifier();
    var docReader = new ReadDocumentFromTiraFile();
    var categories = new List<TextSource>();

    Console.WriteLine("Scanning...");

    // Read the corpus description: candidate authors, unknown texts, encoding, language.
    dynamic jsonConfig;
    using (StreamReader sr = new StreamReader(Path.Combine(mainFolder, "meta-file.json")))
    {
        jsonConfig = JsonConvert.DeserializeObject(sr.ReadToEnd());
    }
    var unknownFolder = (string)jsonConfig.folder;
    var encodingString = (string)jsonConfig.encoding;
    var cultureString = (string)jsonConfig.language;

    CultureInfo ci = null;
    switch (cultureString)
    {
        case "EN":
            ci = new CultureInfo("en-US");
            break;
        default:
            throw new ApplicationException("culture not found");
    }

    Encoding encoding = null;
    switch (encodingString)
    {
        case "UTF8":
            encoding = Encoding.UTF8;
            break;
        case "ASCII":
            encoding = Encoding.ASCII;
            break;
        default:
            throw new ApplicationException("encoding not found");
    }

    // Build one TextSource per candidate author from the training documents.
    foreach (var item in jsonConfig["candidate-authors"])
    {
        var authorName = (string)item["author-name"];
        var docs = new DirectoryInfo(Path.Combine(mainFolder, authorName)).GetFiles();
        var dataSource = new List<string>();
        foreach (var doc in docs)
        {
            try
            {
                dataSource.Add(docReader.ReadDocumentText(doc.FullName, encoding, ci));
            }
            catch
            {
                Console.WriteLine("Document {0} unreadable", doc.FullName);
            }
        }
        categories.Add(processor.Process(dataSource, authorName));
    }

    int n = nGramSize;
    Console.WriteLine("Scanned {1} documents in {0} categories",
        categories.Count,
        categories.Select(el => el.Documents.Count).Aggregate((el1, el2) => el1 + el2));

    // Merge all categories into one source; it serves as the reference corpus
    // for the smoothing technique.
    var allInOne = new TextSource();
    allInOne.Documents.AddRange(categories.SelectMany(el => el.Documents));

    Console.WriteLine("Building hash tables for n = 1..{0} ...", n);
    Parallel.ForEach(categories, category =>
    {
        for (int i = 1; i <= n; i++)
        {
            category.BuildSegmentTable(i);
            Console.WriteLine("hashed {0} with n={1}", category.Name, i);
        }
    });
    allInOne.SetNGramCache(NGramCache.Aggregate(categories.Select(el => el.GetNGramCache())));
    Console.WriteLine("aggregated hashing");

    Console.WriteLine("Getting smoothing ready ...");
    smoothing.Init(allInOne, nGramSize);

    var categoriesToTest = new Dictionary<TextSource, CategoryProbabilityDistribution>();
    foreach (var cat in categories)
    {
        categoriesToTest[cat] = new CategoryProbabilityDistribution(cat, smoothing, n);
    }

    Console.WriteLine("Start classifying ...");
    int totalProgress = jsonConfig["unknown-texts"].Count * categoriesToTest.Count;
    int progress = 0;

    // Report progress in the console title every five seconds.
    System.Timers.Timer t = new System.Timers.Timer(5000);
    t.Elapsed += (sender, eventArgs) =>
    {
        Console.Title = "Task is running. Progress: "
            + Math.Round(((double)progress / (double)totalProgress) * 100.0, 2) + "%";
    };
    t.AutoReset = true;
    t.Start();

    foreach (var item in jsonConfig["unknown-texts"])
    {
        TextSource topCategory = null;
        var maxProb = 0.0;
        var textName = (string)item["unknown-text"];
        var probs = new List<double>();

        Parallel.ForEach(categoriesToTest, catDist =>
        {
            var docText = new[] { docReader.ReadDocumentText(Path.Combine(mainFolder, unknownFolder, textName), encoding, ci) };
            var docSource = processor.Process(docText, "unknown").Documents.First();
            double p = bayesClassifier.P_c(catDist.Value, docSource, n,
                (double)catDist.Key.Documents.Count / (double)allInOne.Documents.Count);
            lock (probs)
            {
                probs.Add(p);
                if (topCategory == null || p > maxProb)
                {
                    topCategory = catDist.Key;
                    maxProb = p;
                }
            }
            Interlocked.Increment(ref progress);
        });

        // Derive a confidence score from how far the runner-up probabilities
        // fall below the winning one.
        probs.Remove(maxProb);
        double preScore = 0.0;
        double maxSubScore = 0.0;
        foreach (var p in probs)
        {
            var subScore = Math.Abs((maxProb - p) / maxProb) * Math.Pow(Math.E, 3); // normalized difference
            var eSubScore = Math.Exp(-subScore);
            preScore += eSubScore;
            if (eSubScore > maxSubScore) maxSubScore = eSubScore;
        }
        double score = Math.Round(1.0 - (0.5 * (preScore / probs.Count) + 0.5 * maxSubScore), 2);

        Console.WriteLine("Classified {0} as author {1} with score {2}", textName, topCategory.Name, score);
        resultSet.answers.Add(new Result(textName, topCategory.Name, score));

        // Rewrite the answer file after every classification so partial
        // results survive an interrupted run.
        Console.WriteLine("writing data to file ...");
        string data = JsonConvert.SerializeObject(resultSet, Formatting.Indented);
        using (StreamWriter sw = new StreamWriter(Path.Combine(outputDir, "answers.json"), false))
        {
            sw.Write(data);
            sw.Flush();
        }
    }
}
public void Start(ILanguageProcessor processor, ISmoothingTechnique smoothing)
{
    var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
    var bayesClassifier = new BayesTextClassifier();
    var docReader = new ReadDocumentFromXmlFile();
    var docPath = Path.Combine(baseDirectory, "data");
    var authors = new DirectoryInfo(docPath).GetDirectories();
    var categories = new List<TextSource>();

    // Prepare data: one TextSource per author directory. The corpus is Greek,
    // stored in the Windows-1253 code page.
    foreach (var item in authors)
    {
        var docs = item.GetFiles();
        var dataSource = new List<string>();
        foreach (var doc in docs)
        {
            try
            {
                dataSource.Add(docReader.ReadDocumentText(doc.FullName, Encoding.GetEncoding(1253), new CultureInfo("el-GR")));
            }
            catch
            {
                Console.WriteLine("Document {0} unreadable", doc.FullName);
            }
        }
        categories.Add(processor.Process(dataSource, item.Name));
    }
    Console.WriteLine("Scanned {1} documents in {0} categories",
        categories.Count,
        categories.Select(el => el.Documents.Count).Aggregate((el1, el2) => el1 + el2));

    var testPath = Path.Combine(baseDirectory, "test");
    var testAuthors = new DirectoryInfo(testPath).GetDirectories();

    var allInOne = new TextSource();
    allInOne.Documents.AddRange(categories.SelectMany(el => el.Documents));

    // Evaluate n-gram sizes from 1 to 4.
    for (int n = 1; n <= 4; n++)
    {
        Console.WriteLine("-----PREPARE for n = {0}", n);
        Console.WriteLine("Building hash tables ...");
        Parallel.ForEach(categories, category =>
        {
            category.BuildSegmentTable(n);
        });
        allInOne.SetNGramCache(NGramCache.Aggregate(categories.Select(el => el.GetNGramCache())));

        Console.WriteLine("Getting smoothing ready ...");
        smoothing.Init(allInOne, n);

        var categoriesToTest = new Dictionary<TextSource, CategoryProbabilityDistribution>();
        foreach (var cat in categories)
        {
            categoriesToTest[cat] = new CategoryProbabilityDistribution(cat, smoothing, n);
        }

        int rightClassified = 0;
        int wrongClassified = 0;
        Console.WriteLine("-----Algorithm starts now");
        foreach (var testAuthor in testAuthors)
        {
            foreach (var testDocument in testAuthor.GetFiles())
            {
                TextSource topCategory = null;
                var maxProb = 0.0;
                var sync = new object();
                Parallel.ForEach(categoriesToTest, catDist =>
                {
                    var docText = new[] { docReader.ReadDocumentText(testDocument.FullName, Encoding.GetEncoding(1253), new CultureInfo("el-GR")) };
                    var docSource = processor.Process(docText, testAuthor.Name).Documents.First();
                    // Uniform prior: every author is equally likely a priori.
                    double p = bayesClassifier.P_c(catDist.Value, docSource, n, 1.0 / (double)categories.Count);
                    lock (sync) // keep the max search thread-safe across parallel iterations
                    {
                        if (topCategory == null || p > maxProb)
                        {
                            topCategory = catDist.Key;
                            maxProb = p;
                        }
                    }
                });
                Console.WriteLine("Classified {0} as author {1} - {2}",
                    testDocument.Name, topCategory.Name,
                    topCategory.Name == testAuthor.Name ? "correct" : "incorrect");
                if (topCategory.Name == testAuthor.Name) rightClassified++;
                else wrongClassified++;
            }
        }
        Console.WriteLine("-----SUMMARY");
        Console.WriteLine("Success rate for n={0} is {1}\n", n,
            (double)rightClassified / ((double)rightClassified + (double)wrongClassified));
    }
}
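A sketch of how the two entry points might be wired together; the concrete class names below are placeholders, since only the ILanguageProcessor and ISmoothingTechnique interfaces appear above:

// Hypothetical wiring; the implementation classes and the classes hosting
// the two Start methods are stand-in names, not taken from the source.
ILanguageProcessor processor = new WordLanguageProcessor();

// Note: Init is guarded by _initPassed, so each run should get a fresh
// smoothing instance.
new LocalTestRunner().Start(processor, new AbsoluteDiscountingSmoothing());
new TiraRunner().Start(@"C:\corpus", @"C:\output", new AbsoluteDiscountingSmoothing(), 3, processor);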