/// <summary>
/// Downloads the content of a wikipedia article, cleans it and persists both
/// the raw and cleaned version of the article in the Data folder.
/// </summary>
/// <param name="title">Title of the wikipedia article to fetch and clean.</param>
private static void CompareWikiTextAndCleanText(string title)
{
    var page = XmlDumpFileReader.GetPage(title);
    var wikiMarkupCleaner = new WikiMarkupCleaner();

    var pathToDirectory = Utilities.PathToProject + "Data/CleanTextCompare/";
    // CreateDirectory is a no-op when the directory already exists,
    // so no Directory.Exists guard is needed.
    Directory.CreateDirectory(pathToDirectory);

    // Write the raw content of the page
    var rawFilePath = pathToDirectory + "raw.txt";
    File.WriteAllText(rawFilePath, page.Text);

    // Write the cleaned content of the page
    var cleanedText = wikiMarkupCleaner.CleanArticleContent(page.Text);
    var cleanedFilePath = pathToDirectory + "cleaned.txt";
    File.WriteAllText(cleanedFilePath, cleanedText);

    Console.WriteLine("Files with '{0}' page content (raw & cleaned) have been written", title);
}
/// <summary>
/// For each dump file already downloaded on disk, extract the articles' text,
/// clean it and save it in a specific text file (one file per article, written
/// into a directory named after the dump file).
/// </summary>
private static void ExtractTextFromDumpFiles()
{
    Console.WriteLine("Extraction of text from dump files started");
    var wikiMarkupCleaner = new WikiMarkupCleaner();

    // Only process the relevant dump parts: enwiki-latest-pages-meta-current<N>*.bz2
    var relevantFilePaths = Directory.GetFiles(Utilities.PathToDownloadDirectory)
        .Where(f => Regex.IsMatch(f, "enwiki-latest-pages-meta-current\\d") && Path.GetExtension(f) == ".bz2")
        .ToList();

    // Skip non-article pages; their titles contain ":" (Talk:, User:, etc.)
    Predicate<string> pageFilterer = s => s.Contains(":");

    foreach (var relevantFilePath in relevantFilePaths)
    {
        Console.WriteLine("Start extracting text from {0}", relevantFilePath);

        var fileName = Path.GetFileNameWithoutExtension(relevantFilePath);
        // We extract the articles in the directory with the same name
        var directoryPath = Utilities.PathToDownloadDirectory + fileName;
        if (!Directory.Exists(directoryPath))
        {
            Directory.CreateDirectory(directoryPath);
        }

        var xmlReader = new XmlDumpFileReader(relevantFilePath);
        var next = xmlReader.ReadNext(pageFilterer);
        while (next != null)
        {
            try
            {
                var filePath = directoryPath + "/" + Utilities.SanitizeFileName(next.Title) + ".txt";

                // Cleanup article content
                var cleanedText = wikiMarkupCleaner.CleanArticleContent(next.Text);
                if (!string.IsNullOrEmpty(cleanedText))
                {
                    File.WriteAllText(filePath, cleanedText);
                }
            }
            catch (Exception ex)
            {
                // Best-effort: a failure on one article must not abort the whole dump.
                Console.WriteLine("Exception raised on article '{0}': {1}", next.Title, ex.Message);
            }
            next = xmlReader.ReadNext(pageFilterer);
        }

        Console.WriteLine("Done extracting text from {0}", relevantFilePath);
        // GetFiles returns an array: use Length instead of the LINQ Count() extension.
        Console.WriteLine("{0} articles extracted", Directory.GetFiles(directoryPath).Length);
        Console.WriteLine("--------");
    }
    Console.WriteLine("Extraction of text from dump files done");
    Console.WriteLine("========================================");
}
/// <summary>
/// Extracted from wikipedia dumps (step 1/3).
/// Browse the wiki dumps and extract the markdown for each company infobox,
/// persisting the result into the database after each dump file.
/// </summary>
private static void ParseCompanyInfoboxesInDumps()
{
    var totalTimer = Stopwatch.StartNew();

    var dumpDirectory = Utilities.PathToDownloadDirectory;
    foreach (var dumpFilePath in Directory.EnumerateFiles(dumpDirectory))
    {
        Console.WriteLine("Start parsing infoboxes in file {0}", Path.GetFileName(dumpFilePath));
        var stepTimer = Stopwatch.StartNew();

        // Titles containing ":" denote non-article pages (talk, user, etc.),
        // which are filtered out while reading the dump.
        Predicate<string> skipNonArticlePages = title => title.Contains(":");

        var parsedInfoboxes = new List<RawDumpParsedInfobox>();
        var dumpReader = new XmlDumpFileReader(dumpFilePath);
        for (var currentPage = dumpReader.ReadNext(skipNonArticlePages);
             currentPage != null;
             currentPage = dumpReader.ReadNext(skipNonArticlePages))
        {
            var pageTitle = currentPage.Title;
            foreach (var infoboxMarkup in currentPage.GetInfoboxTexts("company"))
            {
                parsedInfoboxes.Add(new RawDumpParsedInfobox()
                {
                    Markdown = HttpUtility.HtmlDecode(infoboxMarkup),
                    PageTitle = pageTitle
                });
            }
        }

        stepTimer.Stop();
        Console.WriteLine("Parsed {0} infoboxes in {1}", parsedInfoboxes.Count, stepTimer.Elapsed.ToString());
        stepTimer.Restart();

        // Persist the infoboxes collected from this dump file
        using (var db = new WikiContext())
        {
            db.RawDumpParsedInfoboxes.AddRange(parsedInfoboxes);
            db.SaveChanges();
        }
        stepTimer.Stop();
        Console.WriteLine("Persisted {0} infoboxes in {1}", parsedInfoboxes.Count, stepTimer.Elapsed.ToString());
        Console.WriteLine("--");
    }

    totalTimer.Stop();
    Console.WriteLine("Total infobox parsing time: {0}", totalTimer.Elapsed.ToString());
}