Example #1
 public static void ParseToJson()
 {
     Parallel.ForEach(WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(rf => rf.pages >= wiktPageNumLimit), new ParallelOptions {
         MaxDegreeOfParallelism = 6
     }, rf => {
         //var rf = WikiRawConsts.loadStat().First(f => f.lang == "cs" && f.type == WikiRawConsts.wiktionary);
         IEnumerable<WikimediaPage> pages = new Wikimedia(rf.fileName()).Articles.Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);
         var cnt = 0;
         JsonNew.SerializeEnum<WikimediaPage>(
             rf.fileNameDump() + ".parsed.json",
             pages
                 // Keep pages with at least one section; for the Czech dump the section must be "čeština".
                 .Where(p => p.Sections.FirstOrDefault(s => rf.lang != "cs" || s.SectionName.Trim().ToLower() == "čeština") != null)
                 .identityEnum(page => {
                     // Log progress every 10,000 pages.
                     if (cnt % 10000 == 0)
                     {
                         Console.WriteLine($"{rf.lang} {cnt}");
                     }
                     cnt++;
                     // Drop the raw wikitext so the serialized JSON stays small.
                     page.Text = "";
                 }));
         //using (var wr = new JsonStreamWriter(rf.fileNameDump() + ".parsed.json"))
         //  foreach (var page in pages.Where(p => p.Sections.FirstOrDefault(s => rf.lang != "cs" || s.SectionName.Trim().ToLower() == "čeština") != null)) {
         //    if (cnt % 10000 == 0) Console.WriteLine($"{rf.lang} {cnt}");
         //    cnt++;
         //    page.Text = "";
         //    wr.Serialize(page);
         //  }
     });
 }
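
The identityEnum extension used above (and in Example #2 below) is not part of the listing. A minimal sketch of what the call sites imply, assuming it simply yields each element unchanged after invoking a side-effecting action; the name and signature follow the call sites, the body is an assumption:

 using System;
 using System.Collections.Generic;

 public static class EnumerableExtensions
 {
     // Yields each item unchanged after invoking the given action on it,
     // e.g. for progress logging or clearing page.Text before serialization.
     public static IEnumerable<T> identityEnum<T>(this IEnumerable<T> items, Action<T> action)
     {
         foreach (var item in items)
         {
             action(item);
             yield return item;
         }
     }
 }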
Example #2
    public static void ExtractSections()
    {
        var stat = WikiRawConsts.loadStat();

        Parallel.ForEach(WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(rf => rf.pages >= wiktPageNumLimit), new ParallelOptions {
            MaxDegreeOfParallelism = 6
        }, rf => {
            IEnumerable<WikimediaPage> pages = new Wikimedia(rf.fileName()).Articles.Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);
            var cnt = 0;
            JsonNew.SerializeEnum<Sections>(
                rf.fileNameDump() + ".sec.json",
                pages.Select(p => new Sections(p)).identityEnum(sect => {
                    // Log progress every 100,000 pages.
                    if (cnt % 100000 == 0)
                    {
                        Console.WriteLine($"{rf.lang} {cnt}");
                    }
                    cnt++;
                }));
            //using (var wr = new JsonStreamWriter(rf.fileNameDump() + ".sec.json"))
            //  foreach (var sect in pages.Select(p => new Sections(p))) {
            //    if (cnt % 100000 == 0) Console.WriteLine($"{rf.lang} {cnt}");
            //    cnt++;
            //    wr.Serialize(sect);
            //  }
            lock (stat) {
                // Record the parsed page count for this language in the shared stats list.
                stat.First(s => s.type == WikiRawConsts.wiktionary && s.lang == rf.lang).pages = cnt;
            }
        });
        WikiRawConsts.saveStat();
    }
Example #3
        private static void GetMotorcycleArticles()
        {
            var wm = new Wikimedia("enwiki-20160701-pages-articles-multistream.xml");

            var articles = wm.Articles
                           .Where(article => article.Text.Contains("{{Infobox Motorcycle"));

            Wikimedia.WriteToDisk(articles, "motorcycles.dat");
        }
Example #4
        private static void Main(string[] args)
        {
            if (!ParseArgs(args))
            {
                Console.WriteLine("NGrams.exe [-in input.(xml|dat)] [-db frequencies.db] [-out frequencies.txt]");
                Console.WriteLine("At least two of the input files must be present:");
                Console.WriteLine("   -in specifies the Wikipedia plaintext dump location");
                Console.WriteLine("   -db specifies the location of processed n-gram frequencies.");
                Console.WriteLine("        When used with -in, the input file will be processed into -db");
                Console.WriteLine("        When used without -in, assume frequencies exist and read from this db");
                Console.WriteLine("   -out specifies the plaintext TSV that should contain the database dump");
                return;
            }

            if (string.IsNullOrEmpty(inputFile) && !string.IsNullOrEmpty(dbFile))
            {
                // process existing DB into frequency counts
                DumpFrequenciesToDisk(WordFrequency.GetNGramFrequencies(dbFile, cutoff), outputFile);
            }
            else
            {
                var wm       = new Wikimedia(inputFile);
                var articles = wm.Articles
                               .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

                if (articleLimit > 0)
                {
                    articles = articles.Take(articleLimit);
                }

                if (string.IsNullOrEmpty(dbFile))
                {
                    // don't care about the db; just use a temp file
                    dbFile = Path.GetTempFileName();
                }

                Console.WriteLine("Beginning n-gram calculation");
                var startTime = DateTime.Now;
                WordFrequency.CalculateNGramFrequencies(articles, dbFile, nGramSize);
                var endTime = DateTime.Now;

                Console.WriteLine("Calculation took " + (endTime - startTime));

                if (!string.IsNullOrEmpty(outputFile))
                {
                    Console.WriteLine("Writing frequencies to disk");

                    startTime = DateTime.Now;
                    DumpFrequenciesToDisk(WordFrequency.GetNGramFrequencies(dbFile, cutoff), outputFile);
                    endTime = DateTime.Now;

                    Console.WriteLine("Dump to disk took " + (endTime - startTime));
                }
            }
        }
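
Going by the usage text and the branching in Main above, typical invocations would look like this (the file names are illustrative only):

        NGrams.exe -in enwiki.xml -db frequencies.db           (parse the dump into an n-gram database)
        NGrams.exe -db frequencies.db -out frequencies.txt     (dump an existing database to a plaintext TSV)
        NGrams.exe -in enwiki.xml -out frequencies.txt         (parse via a temporary database and write the TSV)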
Example #5
        private TranslatedEntities ExtractTranslatedEntitiesFromWiktionaryContent(string srcLanguage, string tgtLanguage,
                                                                                  Wikimedia wikimedia, DumpDownloader dumpDownloader)
        {
            // Download dump files with pages and langlinks of the source language (always take the latest version)
            var pagesDumpFileName = string.Format("{0}{1}-latest-pages-meta-current.xml.bz2", srcLanguage, GetWikimediaExtension(wikimedia));
            var srcPageFilePath   = dumpDownloader.DownloadFile(pagesDumpFileName);

            var parser             = new WiktionaryDumpParser.Src.WiktionaryDumpParser();
            var translatedEntities = parser.ExtractTranslatedEntities(srcPageFilePath, srcLanguage, tgtLanguage);

            return(translatedEntities);
        }
Example #6
        private string GetWikimediaExtension(Wikimedia wikimedia)
        {
            switch (wikimedia)
            {
            case Wikimedia.Wikipedia:
                return("wiki");

            case Wikimedia.Wiktionary:
                return("wiktionary");

            default:
                throw new ArgumentException(string.Format("Wikimedia '{0}' is not supported", Enum.GetName(typeof(Wikimedia), wikimedia)));
            }
        }
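
For illustration, the format string in ExtractTranslatedEntitiesFromWiktionaryContent combines with GetWikimediaExtension as follows (derived from the code above, with "en" as an example source language):

        // srcLanguage = "en", GetWikimediaExtension(Wikimedia.Wiktionary) = "wiktionary"
        // "{0}{1}-latest-pages-meta-current.xml.bz2"  ->  "enwiktionary-latest-pages-meta-current.xml.bz2"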
Example #7
        static void Main(string[] args)
        {
            if (!ParseArgs(args))
            {
                Usage();
                return;
            }

            var startTime = DateTime.Now;

            var wm = new Wikimedia(inputFile);

            IEnumerable<WikimediaPage> articles = wm.Articles
                                                  .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

            if (!string.IsNullOrEmpty(title))
            {
                var targetArticle =
                    articles.FirstOrDefault(
                        article => article.Title.Equals(title, StringComparison.CurrentCultureIgnoreCase));

                if (targetArticle == null)
                {
                    Console.WriteLine("Could not find article '{0}'", title);
                    return;
                }

                File.WriteAllText(outputFile, rawText ? targetArticle.Text : targetArticle.Plaintext);
            }
            else
            {
                if (articleLimit > 0)
                {
                    articles = articles.Take(articleLimit);
                }

                var numberOfArticles = Wikimedia.WriteToDisk(articles, outputFile);
                Console.WriteLine("Wrote {0} articles to disk.", numberOfArticles);
            }

            var endTime = DateTime.Now;

            TimeSpan processTime = endTime - startTime;

            Console.WriteLine("Process took " + processTime);
        }
Example #8
        /// <summary>
        /// Creates a translation dictionary between <paramref name="srcLanguage"/> and <paramref name="tgtLanguage"/> by:
        /// - downloading the appropriate files
        /// - extracting information from them
        /// - associating the info of the different files
        /// </summary>
        /// <param name="srcLanguage">The ISO-639-1 code for the source language</param>
        /// <param name="tgtLanguage">The ISO-639-1 code for the target language</param>
        /// <param name="wikimedia">The wikipedia resource (wikipedia, witionary...)</param>
        /// <returns>The collection of translated entities</returns>
        public string CreateDictionary(string srcLanguage, string tgtLanguage, Wikimedia wikimedia, DumpDownloader dumpDownloader)
        {
            Console.WriteLine("Start creating dictionary {0}-{1}", srcLanguage, tgtLanguage);

            // Creates the output file name
            var outputFilePath = PathToProject + string.Format("Output\\{0}-{1}-{2}-dictionary.txt",
                                                               srcLanguage, tgtLanguage, Enum.GetName(typeof(Wikimedia), wikimedia));

            TranslatedEntities translatedEntities;

            if (wikimedia == Wikimedia.Wikipedia)
            {
                // Creates the dictionary from the language links between the pages
                translatedEntities = ExtractTranslatedEntitiesFromLanguageLinks(srcLanguage, tgtLanguage, wikimedia, dumpDownloader);
            }
            else if (wikimedia == Wikimedia.Wiktionary)
            {
                // Creates the dictionary from the content of wiktionary pages (a translation section is sometimes present on wiktionary)
                translatedEntities = ExtractTranslatedEntitiesFromWiktionaryContent(srcLanguage, tgtLanguage, wikimedia, dumpDownloader);
            }
            else
            {
                throw new ArgumentException(string.Format("Wikimedia '{0}' is not supported for building translation dictionaries", Enum.GetName(typeof(Wikimedia), wikimedia)));
            }

            // Write all translated entities
            if (File.Exists(outputFilePath))
            {
                File.Delete(outputFilePath);
            }
            File.AppendAllLines(outputFilePath, translatedEntities.GetTextFileLines());

            Console.WriteLine("Finished creating dictionary {0}-{1}", srcLanguage, tgtLanguage);
            Console.WriteLine("----");
            return(outputFilePath);
        }
Example #9
        /// <summary>
        /// Creates a translation dictionary from language links for a Wikimedia resource. Language links are a specific kind of
        /// interwiki link (i.e. links between two different domains, such as en.wikipedia.org and fr.wikipedia.org).
        /// See https://en.wiktionary.org/wiki/Help:FAQ#What_are_interwiki_links.3F for more details.
        /// </summary>
        /// <param name="srcLanguage">The ISO-639-1 code for the source language</param>
        /// <param name="tgtLanguage">The ISO-639-1 code for the target language</param>
        /// <param name="wikimedia">The wikipedia resource (wikipedia, witionary...)</param>
        /// <returns>The collection of translated entities</returns>
        private TranslatedEntities ExtractTranslatedEntitiesFromLanguageLinks(string srcLanguage, string tgtLanguage, Wikimedia wikimedia, DumpDownloader dumpDownloader)
        {
            var translatedEntities = new TranslatedEntities()
            {
                SrcLanguage = srcLanguage,
                TgtLanguage = tgtLanguage,
                Entities    = new List<TranslatedEntity>()
            };

            // Download dump files with pages and langlinks of the source language (always take the latest version)
            var pageDumpFileName      = string.Format("{0}{1}-latest-page.sql.gz", srcLanguage, GetWikimediaExtension(wikimedia));
            var srcPagePropsFilePath  = dumpDownloader.DownloadFile(pageDumpFileName);
            var langLinksDumpFileName = string.Format("{0}{1}-latest-langlinks.sql.gz", srcLanguage, GetWikimediaExtension(wikimedia));
            var srcLangLinksFilePath  = dumpDownloader.DownloadFile(langLinksDumpFileName);

            // Parse language links and load them in dictionary for fast retrieval
            Console.WriteLine("Start parsing language links");
            var parser        = new DumpParser();
            var languageLinks = parser.ParseLanguageLinks(srcLangLinksFilePath, tgtLanguage)
                                .ToDictionary(ll => ll.Id, ll => ll);

            Console.WriteLine("{0} language links found", languageLinks.Count());

            // Associate the pages (with title in src language) with the language links (with title in tgt language)
            Console.WriteLine("Start associating pages and language links");
            var counter        = 0;
            var pageInfoReader = new SqlDumpFileReader(srcPagePropsFilePath);
            var pageInfo       = pageInfoReader.ReadNext();

            while (pageInfo != null)
            {
                LanguageLink languageLink;
                if (languageLinks.TryGetValue(pageInfo.Id, out languageLink))
                {
                    counter++;
                    translatedEntities.Entities.Add(new TranslatedEntity()
                    {
                        SrcName = pageInfo.GetDisplayedTitle(),
                        TgtName = languageLink.GetDisplayedTitle()
                    });
                }

                pageInfo = pageInfoReader.ReadNext();
            }
            Console.WriteLine("Associated {0} entries for {1}-{2}", counter, srcLanguage, tgtLanguage);

            return(translatedEntities);
        }
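
A minimal sketch of how CreateDictionary might be driven; DictionaryBuilder (as a name for the enclosing class) and the parameterless DumpDownloader constructor are assumptions for illustration, not shown in the examples:

        // Hypothetical driver code; the class name and constructors are assumed.
        var builder    = new DictionaryBuilder();
        var downloader = new DumpDownloader();
        var path       = builder.CreateDictionary("en", "fr", Wikimedia.Wiktionary, downloader);
        Console.WriteLine("Dictionary written to {0}", path);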