/// <summary>
/// Looks up the page with the given title through <c>XmlDumpFileReader.GetPage</c> and writes
/// two files into the "Data/CleanTextCompare/" folder of the project: "raw.txt" holding the
/// page text as returned by the reader, and "cleaned.txt" holding the same text after it went
/// through <c>WikiMarkupCleaner.CleanArticleContent</c>. Existing files are overwritten.
/// </summary>
/// <param name="title">Title of the page to fetch; also echoed in the final console message.</param>
private static void CompareWikiTextAndCleanText(string title)
{
    var article = XmlDumpFileReader.GetPage(title);
    var markupCleaner = new WikiMarkupCleaner();

    // Make sure the output folder is there before writing anything into it
    var outputDirectory = Utilities.PathToProject + "Data/CleanTextCompare/";
    var outputDirectoryIsMissing = !Directory.Exists(outputDirectory);
    if (outputDirectoryIsMissing)
    {
        Directory.CreateDirectory(outputDirectory);
    }

    // Raw version first: it is persisted even if the cleaning step below fails
    File.WriteAllText(outputDirectory + "raw.txt", article.Text);

    // Cleaned version
    var cleanedContent = markupCleaner.CleanArticleContent(article.Text);
    File.WriteAllText(outputDirectory + "cleaned.txt", cleanedContent);

    Console.WriteLine("Files with '{0}' page content (raw & cleaned) has been written", title);
}
/// <summary>
/// Processes every ".bz2" file of the download directory whose path matches
/// "enwiki-latest-pages-meta-current" followed by a digit. For each such dump file, the pages
/// returned by the reader are cleaned and written as individual ".txt" files (named after the
/// sanitized page title) into a directory named after the dump file without its extension.
/// </summary>
/// <remarks>
/// Pages whose cleaned text is null or empty are not written. An exception raised while
/// handling a single page is reported on the console and the extraction moves on to the next page.
/// </remarks>
private static void ExtractTextFromDumpFiles()
{
    Console.WriteLine("Extraction of text from dump files started");

    var markupCleaner = new WikiMarkupCleaner();

    // Only keep the compressed dump files we are interested in
    var dumpFilePaths =
        (from candidate in Directory.GetFiles(Utilities.PathToDownloadDirectory)
         where Regex.IsMatch(candidate, "enwiki-latest-pages-meta-current\\d")
               && Path.GetExtension(candidate) == ".bz2"
         select candidate).ToArray();

    // Title predicate handed to the reader for every page.
    // NOTE(review): presumably used to skip pages whose title contains ':' - confirm in XmlDumpFileReader.ReadNext
    Predicate<string> titleFilter = pageTitle => pageTitle.Contains(":");

    foreach (var dumpFilePath in dumpFilePaths)
    {
        Console.WriteLine("Start extracting text from {0}", dumpFilePath);

        // The articles go into a directory named like the dump file (extension stripped)
        var outputDirectory = Utilities.PathToDownloadDirectory + Path.GetFileNameWithoutExtension(dumpFilePath);
        if (Directory.Exists(outputDirectory) == false)
        {
            Directory.CreateDirectory(outputDirectory);
        }

        var dumpReader = new XmlDumpFileReader(dumpFilePath);
        for (var page = dumpReader.ReadNext(titleFilter); page != null; page = dumpReader.ReadNext(titleFilter))
        {
            try
            {
                var articleFilePath = outputDirectory + "/" + Utilities.SanitizeFileName(page.Title) + ".txt";

                // Strip the wiki markup; nothing is persisted when no text is left
                var articleText = markupCleaner.CleanArticleContent(page.Text);
                if (string.IsNullOrEmpty(articleText))
                {
                    continue;
                }

                File.WriteAllText(articleFilePath, articleText);
            }
            catch (Exception ex)
            {
                // Best effort: a failing article must not abort the whole dump file
                Console.WriteLine("Exception raised on article '{0}': {1}", page.Title, ex.Message);
            }
        }

        Console.WriteLine("Done extraction text from {0}", dumpFilePath);
        // Counts every file present in the output directory, not only the ones written by this run
        Console.WriteLine("{0} articles extracted", Directory.GetFiles(outputDirectory).Length);
        Console.WriteLine("--------");
    }

    Console.WriteLine("Extraction of text from dump files done");
    Console.WriteLine("========================================");
}