Example #1
        /// <summary>
        /// Downloads the content of a Wikipedia article, cleans it and persists both
        /// the raw and the cleaned version of the article in the Data folder.
        /// </summary>
        private static void CompareWikiTextAndCleanText(string title)
        {
            var page = XmlDumpFileReader.GetPage(title);
            var wikiMarkupCleaner = new WikiMarkupCleaner();

            var pathToDirectory = Utilities.PathToProject + "Data/CleanTextCompare/";

            if (!Directory.Exists(pathToDirectory))
            {
                Directory.CreateDirectory(pathToDirectory);
            }

            // Write the raw content of the page
            var rawFilePath = pathToDirectory + "raw.txt";

            File.WriteAllText(rawFilePath, page.Text);

            // Write the cleaned content of the page
            var cleanedText     = wikiMarkupCleaner.CleanArticleContent(page.Text);
            var cleanedFilePath = pathToDirectory + "cleaned.txt";

            File.WriteAllText(cleanedFilePath, cleanedText);

            Console.WriteLine("Files with '{0}' page content (raw & cleaned) has been written", title);
        }
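
A minimal usage sketch, assuming the method is called from elsewhere in the console application; the article title and the size comparison are illustrative, and only CompareWikiTextAndCleanText and Utilities.PathToProject come from the example above:

        // Usage sketch (assumption): the article title is arbitrary and the size comparison
        // is just one way to inspect the two output files side by side.
        private static void RunCleanTextComparison()
        {
            CompareWikiTextAndCleanText("C Sharp (programming language)");

            var directory     = Utilities.PathToProject + "Data/CleanTextCompare/";
            var rawLength     = new FileInfo(directory + "raw.txt").Length;
            var cleanedLength = new FileInfo(directory + "cleaned.txt").Length;
            Console.WriteLine("Raw: {0} bytes, cleaned: {1} bytes", rawLength, cleanedLength);
        }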
Example #2
        /// <summary>
        /// For each dump file already downloaded on disk, extracts the articles' text,
        /// cleans it and saves each article to its own text file.
        /// </summary>
        private static void ExtractTextFromDumpFiles()
        {
            Console.WriteLine("Extraction of text from dump files started");
            var wikiMarkupCleaner = new WikiMarkupCleaner();

            var relevantFilePaths = Directory.GetFiles(Utilities.PathToDownloadDirectory)
                                    .Where(f => Regex.IsMatch(f, "enwiki-latest-pages-meta-current\\d") && Path.GetExtension(f) == ".bz2")
                                    .ToList();
            Predicate<string> pageFilterer = s => s.Contains(":");

            foreach (var relevantFilePath in relevantFilePaths)
            {
                Console.WriteLine("Start extracting text from {0}", relevantFilePath);

                var fileName = Path.GetFileNameWithoutExtension(relevantFilePath);

                // We extract the articles into a directory with the same name as the dump file
                var directoryPath = Utilities.PathToDownloadDirectory + fileName;
                if (!Directory.Exists(directoryPath))
                {
                    Directory.CreateDirectory(directoryPath);
                }

                var xmlReader = new XmlDumpFileReader(relevantFilePath);
                var next      = xmlReader.ReadNext(pageFilterer);
                while (next != null)
                {
                    try
                    {
                        var filePath = directoryPath + "/" + Utilities.SanitizeFileName(next.Title) + ".txt";
                        // Cleanup article content
                        var cleanedText = wikiMarkupCleaner.CleanArticleContent(next.Text);

                        if (!string.IsNullOrEmpty(cleanedText))
                        {
                            File.WriteAllText(filePath, cleanedText);
                        }
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("Exception raised on article '{0}': {1}", next.Title, ex.Message);
                    }

                    next = xmlReader.ReadNext(pageFilterer);
                }

                Console.WriteLine("Done extraction text from {0}", relevantFilePath);
                Console.WriteLine("{0} articles extracted", Directory.GetFiles(directoryPath).Count());
                Console.WriteLine("--------");
            }
            Console.WriteLine("Extraction of text from dump files done");
            Console.WriteLine("========================================");
        }
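
The file-name regex and the title predicate above decide what gets processed. The following illustrative sketch shows their effect; the method name, the sample file names and the sample titles are made up for illustration, and the comment on ReadNext reflects how the examples use the predicate rather than a documented contract:

        // Illustrative sketch (assumption): shows which file names and page titles the
        // selection logic above keeps or skips. Sample values are not from the original code.
        private static void IllustrateDumpFileAndTitleFilters()
        {
            // Dump file selection: the name must contain "enwiki-latest-pages-meta-current"
            // followed by a digit, and the extension must be .bz2.
            var candidates = new[]
            {
                "enwiki-latest-pages-meta-current1.xml-p1p41242.bz2", // kept
                "enwiki-latest-pages-meta-current.xml.bz2",           // skipped: no digit after "current"
                "enwiki-latest-pages-meta-current1.xml"               // skipped: not a .bz2 file
            };
            var kept = candidates
                .Where(f => Regex.IsMatch(f, "enwiki-latest-pages-meta-current\\d") && Path.GetExtension(f) == ".bz2");
            Console.WriteLine(string.Join(", ", kept));

            // Title predicate: titles containing ':' (Talk:, User:, Category:, ...) belong to
            // other namespaces rather than articles; the examples pass this predicate to
            // ReadNext to filter such pages out.
            Predicate<string> pageFilterer = s => s.Contains(":");
            Console.WriteLine(pageFilterer("Talk:Microsoft")); // True  -> page is skipped
            Console.WriteLine(pageFilterer("Microsoft"));      // False -> page is processed
        }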
Example #3
        /// <summary>
        /// Extraction from Wikipedia dumps (step 1/3).
        /// Browses the wiki dumps and extracts the markdown of each company infobox.
        /// </summary>
        private static void ParseCompanyInfoboxesInDumps()
        {
            var generalStopwatch = Stopwatch.StartNew();

            var dumpDir = Utilities.PathToDownloadDirectory;

            foreach (var filePath in Directory.EnumerateFiles(dumpDir))
            {
                Console.WriteLine("Start parsing infoboxes in file {0}", Path.GetFileName(filePath));
                var stopwatch = Stopwatch.StartNew();

                var infoboxes = new List<RawDumpParsedInfobox>();

                var wikiReader = new XmlDumpFileReader(filePath);
                Predicate<string> pageFilterer = s => s.Contains(":"); // Titles containing ':' belong to other namespaces (Talk:, User:, etc.) and are not real articles, so they are filtered out
                var page = wikiReader.ReadNext(pageFilterer);
                while (page != null)
                {
                    var boxes = page.GetInfoboxTexts("company").Select(s => new RawDumpParsedInfobox()
                    {
                        Markdown  = HttpUtility.HtmlDecode(s),
                        PageTitle = page.Title
                    });
                    infoboxes.AddRange(boxes);

                    page = wikiReader.ReadNext(pageFilterer);
                }

                stopwatch.Stop();
                Console.WriteLine("Parsed {0} infoboxes in {1}", infoboxes.Count, stopwatch.Elapsed.ToString());
                stopwatch.Restart();

                // Persist infoboxes
                using (var db = new WikiContext())
                {
                    db.RawDumpParsedInfoboxes.AddRange(infoboxes);
                    db.SaveChanges();
                }

                stopwatch.Stop();
                Console.WriteLine("Persisted {0} infoboxes in {1}", infoboxes.Count, stopwatch.Elapsed.ToString());
                Console.WriteLine("--");
            }


            generalStopwatch.Stop();
            Console.WriteLine("Total infobox parsing time: {0}", generalStopwatch.Elapsed.ToString());
        }
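
The persistence step relies on RawDumpParsedInfobox and WikiContext, which are defined elsewhere in the project. A minimal Entity Framework sketch consistent with the usage above; only Markdown, PageTitle and the RawDumpParsedInfoboxes set appear in the original code, while the Id key and the DbContext wiring are assumptions:

        // Minimal sketch (assumption): entity and context shapes inferred from the usage above.
        // Requires Entity Framework (e.g. using System.Data.Entity for EF6).
        public class RawDumpParsedInfobox
        {
            public int Id { get; set; }           // assumed primary key, not shown in the original
            public string Markdown { get; set; }  // HTML-decoded infobox markup
            public string PageTitle { get; set; } // title of the page the infobox came from
        }

        public class WikiContext : DbContext
        {
            public DbSet<RawDumpParsedInfobox> RawDumpParsedInfoboxes { get; set; }
        }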