コード例 #1
0
        /// <summary>
        /// Creates a translation dictionary between <paramref name="srcLanguage"/> and <paramref name="tgtLanguage"/> by:
        /// - downloading the appropriate files
        /// - extracting information from them
        /// - associating the info of the different files
        /// </summary>
        /// <param name="srcLanguage">The ISO-639-1 code for the source language</param>
        /// <param name="tgtLanguage">The ISO-639-1 code for the target language</param>
        /// <param name="wikimedia">The wikipedia resource (wikipedia, witionary...)</param>
        /// <returns>The collection of translated entities</returns>
        public string CreateDictionary(string srcLanguage, string tgtLanguage, Wikimedia wikimedia, DumpDownloader dumpDownloader)
        {
            Console.WriteLine("Start creating dictionary {0}-{1}", srcLanguage, tgtLanguage);

            // Creates the output file name
            var outputFilePath = PathToProject + string.Format("Output\\{0}-{1}-{2}-dictionary.txt",
                                                               srcLanguage, tgtLanguage, Enum.GetName(typeof(Wikimedia), wikimedia));

            TranslatedEntities translatedEntities;

            if (wikimedia == Wikimedia.Wikipedia)
            {
                // Creates the dictionary from the language links between the pages
                translatedEntities = ExtractTranslatedEntitiesFromLanguageLinks(srcLanguage, tgtLanguage, wikimedia, dumpDownloader);
            }
            else if (wikimedia == Wikimedia.Wiktionary)
            {
                // Creates the dictionary from the content of wiktionary pages (a translation section is sometimes present on wiktionary)
                translatedEntities = ExtractTranslatedEntitiesFromWiktionaryContent(srcLanguage, tgtLanguage, wikimedia, dumpDownloader);
            }
            else
            {
                throw new ArgumentException(string.Format("Wikimedia '{0}' is not supported for building translation dictionaries", Enum.GetName(typeof(Wikimedia), wikimedia)));
            }

            // Write all translated entities
            if (File.Exists(outputFilePath))
            {
                File.Delete(outputFilePath);
            }
            File.AppendAllLines(outputFilePath, translatedEntities.GetTextFileLines());

            Console.WriteLine("Finished creating dictionary {0}-{1}", srcLanguage, tgtLanguage);
            Console.WriteLine("----");
            return(outputFilePath);
        }
コード例 #2
0
        /// <summary>
        /// Creates a translation dictionary from languagelinks for a wikimedia resource. Language links are a specific kind of
        /// interwikilinks (ie links between two different domains - en.wikipedia.org and fr.wikipedia.org for instance).
        /// See https://en.wiktionary.org/wiki/Help:FAQ#What_are_interwiki_links.3F for more details.
        /// </summary>
        /// <param name="srcLanguage">The ISO-639-1 code for the source language</param>
        /// <param name="tgtLanguage">The ISO-639-1 code for the target language</param>
        /// <param name="wikimedia">The wikipedia resource (wikipedia, witionary...)</param>
        /// <returns>The collection of translated entities</returns>
        private TranslatedEntities ExtractTranslatedEntitiesFromLanguageLinks(string srcLanguage, string tgtLanguage, Wikimedia wikimedia, DumpDownloader dumpDownloader)
        {
            var translatedEntities = new TranslatedEntities()
            {
                SrcLanguage = srcLanguage,
                TgtLanguage = tgtLanguage,
                Entities    = new List <TranslatedEntity>()
            };

            // Download dump files with pages and langlinks of the source language (always take the latest version)
            var pageDumpFileName      = string.Format("{0}{1}-latest-page.sql.gz", srcLanguage, GetWikimediaExtension(wikimedia));
            var srcPagePropsFilePath  = dumpDownloader.DownloadFile(pageDumpFileName);
            var langLinksDumpFileName = string.Format("{0}{1}-latest-langlinks.sql.gz", srcLanguage, GetWikimediaExtension(wikimedia));
            var srcLangLinksFilePath  = dumpDownloader.DownloadFile(langLinksDumpFileName);

            // Parse language links and load them in dictionary for fast retrieval
            Console.WriteLine("Start parsing language links");
            var parser        = new DumpParser();
            var languageLinks = parser.ParseLanguageLinks(srcLangLinksFilePath, tgtLanguage)
                                .ToDictionary(ll => ll.Id, ll => ll);

            Console.WriteLine("{0} language links found", languageLinks.Count());

            // Associate the pages (with title in src language) with the language links (with title in tgt language)
            Console.WriteLine("Start associating pages and language links");
            var counter        = 0;
            var pageInfoReader = new SqlDumpFileReader(srcPagePropsFilePath);
            var pageInfo       = pageInfoReader.ReadNext();

            while (pageInfo != null)
            {
                LanguageLink languageLink;
                if (languageLinks.TryGetValue(pageInfo.Id, out languageLink))
                {
                    counter++;
                    translatedEntities.Entities.Add(new TranslatedEntity()
                    {
                        SrcName = pageInfo.GetDisplayedTitle(),
                        TgtName = languageLink.GetDisplayedTitle()
                    });
                }

                pageInfo = pageInfoReader.ReadNext();
            }
            Console.WriteLine("Associated {0} entries for {1}-{2}", counter, srcLanguage, tgtLanguage);

            return(translatedEntities);
        }
コード例 #3
0
        private TranslatedEntities ExtractTranslatedEntitiesFromWiktionaryContent(string srcLanguage, string tgtLanguage,
                                                                                  Wikimedia wikimedia, DumpDownloader dumpDownloader)
        {
            // Download dump files with pages and langlinks of the source language (always take the latest version)
            var pagesDumpFileName = string.Format("{0}{1}-latest-pages-meta-current.xml.bz2", srcLanguage, GetWikimediaExtension(wikimedia));
            var srcPageFilePath   = dumpDownloader.DownloadFile(pagesDumpFileName);

            var parser             = new WiktionaryDumpParser.Src.WiktionaryDumpParser();
            var translatedEntities = parser.ExtractTranslatedEntities(srcPageFilePath, srcLanguage, tgtLanguage);

            return(translatedEntities);
        }