public static void ParseToJson() {
    Parallel.ForEach(
        WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(rf => rf.pages >= wiktPageNumLimit),
        new ParallelOptions { MaxDegreeOfParallelism = 6 },
        rf => {
            // Keep only regular articles; disambiguations, redirects and special pages are skipped.
            IEnumerable<WikimediaPage> pages = new Wikimedia(rf.fileName()).Articles
                .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

            var cnt = 0;
            // Stream-serialize the filtered pages. Pages must have at least one section;
            // for the Czech wiktionary, only pages with a "čeština" section are kept.
            // (A previous implementation wrote the same output via JsonStreamWriter instead of JsonNew.SerializeEnum.)
            JsonNew.SerializeEnum<WikimediaPage>(
                rf.fileNameDump() + ".parsed.json",
                pages
                    .Where(p => p.Sections.FirstOrDefault(s => rf.lang != "cs" || s.SectionName.Trim().ToLower() == "čeština") != null)
                    .identityEnum(page => {
                        if (cnt % 10000 == 0) Console.WriteLine($"{rf.lang} {cnt}");
                        cnt++;
                        page.Text = ""; // drop the raw wiki text before serialization to keep the output small
                    }));
        });
}
public static void ExtractSections() {
    var stat = WikiRawConsts.loadStat();
    Parallel.ForEach(
        WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(rf => rf.pages >= wiktPageNumLimit),
        new ParallelOptions { MaxDegreeOfParallelism = 6 },
        rf => {
            IEnumerable<WikimediaPage> pages = new Wikimedia(rf.fileName()).Articles
                .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

            var cnt = 0;
            // Stream-serialize one Sections record per page, logging progress every 100,000 pages.
            // (A previous implementation wrote the same output via JsonStreamWriter instead of JsonNew.SerializeEnum.)
            JsonNew.SerializeEnum<Sections>(
                rf.fileNameDump() + ".sec.json",
                pages.Select(p => new Sections(p)).identityEnum(sect => {
                    if (cnt % 100000 == 0) Console.WriteLine($"{rf.lang} {cnt}");
                    cnt++;
                }));

            // Record the processed page count back into the shared statistics.
            lock (stat) {
                stat.First(s => s.type == WikiRawConsts.wiktionary && s.lang == rf.lang).pages = cnt;
            }
        });
    WikiRawConsts.saveStat();
}
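// Hedged sketch (not part of the original source): identityEnum, used in the two methods above, is assumed to be a
// pass-through extension that invokes a side-effect action on each element while yielding it unchanged. That is what
// lets the streaming serialization count pages and clear page.Text as the sequence is enumerated. The class name and
// exact signature below are assumptions for illustration; requires using System and System.Collections.Generic.
public static class EnumerableSideEffectExtensions {
    public static IEnumerable<T> identityEnum<T>(this IEnumerable<T> source, Action<T> action) {
        foreach (var item in source) {
            action(item);      // run the side effect (progress logging, trimming fields, ...)
            yield return item; // yield the element unchanged so downstream serialization sees the same sequence
        }
    }
}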
private static void GetMotorcycleArticles() {
    var wm = new Wikimedia("enwiki-20160701-pages-articles-multistream.xml");
    var articles = wm.Articles
        .Where(article => article.Text.Contains("{{Infobox Motorcycle"));
    Wikimedia.WriteToDisk(articles, "motorcycles.dat");
}
private static void Main(string[] args) {
    if (!ParseArgs(args)) {
        Console.WriteLine("NGrams.exe [-in input.(xml|dat)] [-db frequencies.db] [-out frequencies.txt]");
        Console.WriteLine("At least two of the input files must be present:");
        Console.WriteLine("  -in  specifies the Wikipedia plaintext dump location");
        Console.WriteLine("  -db  specifies the location of processed n-gram frequencies.");
        Console.WriteLine("       When used with -in, the input file will be processed into -db");
        Console.WriteLine("       When used without -in, assume frequencies exist and read from this db");
        Console.WriteLine("  -out specifies the plaintext TSV that should contain the database dump");
        return;
    }

    if (string.IsNullOrEmpty(inputFile) && !string.IsNullOrEmpty(dbFile)) {
        // Process an existing DB into frequency counts.
        DumpFrequenciesToDisk(WordFrequency.GetNGramFrequencies(dbFile, cutoff), outputFile);
    } else {
        var wm = new Wikimedia(inputFile);
        var articles = wm.Articles
            .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

        if (articleLimit > 0) {
            articles = articles.Take(articleLimit);
        }

        if (string.IsNullOrEmpty(dbFile)) {
            // Don't care about the db; just use a temp file.
            dbFile = Path.GetTempFileName();
        }

        Console.WriteLine("Beginning n-gram calculation");
        var startTime = DateTime.Now;
        WordFrequency.CalculateNGramFrequencies(articles, dbFile, nGramSize);
        var endTime = DateTime.Now;
        Console.WriteLine("Calculation took " + (endTime - startTime));

        if (!string.IsNullOrEmpty(outputFile)) {
            Console.WriteLine("Writing frequencies to disk");
            startTime = DateTime.Now;
            DumpFrequenciesToDisk(WordFrequency.GetNGramFrequencies(dbFile, cutoff), outputFile);
            endTime = DateTime.Now;
            Console.WriteLine("Dump to disk took " + (endTime - startTime));
        }
    }
}
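// Hedged usage examples derived from the help text above (file names are illustrative only):
//   NGrams.exe -in enwiki-pages-articles.xml -db frequencies.db        // build the n-gram DB from a dump
//   NGrams.exe -db frequencies.db -out frequencies.txt                 // dump an existing DB to a TSV file
//   NGrams.exe -in enwiki-pages-articles.xml -out frequencies.txt      // process into a temp DB, then dump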
private TranslatedEntities ExtractTranslatedEntitiesFromWiktionaryContent(string srcLanguage, string tgtLanguage, Wikimedia wikimedia, DumpDownloader dumpDownloader) {
    // Download the dump file with the pages of the source language (always take the latest version)
    var pagesDumpFileName = string.Format("{0}{1}-latest-pages-meta-current.xml.bz2", srcLanguage, GetWikimediaExtension(wikimedia));
    var srcPageFilePath = dumpDownloader.DownloadFile(pagesDumpFileName);

    var parser = new WiktionaryDumpParser.Src.WiktionaryDumpParser();
    var translatedEntities = parser.ExtractTranslatedEntities(srcPageFilePath, srcLanguage, tgtLanguage);
    return translatedEntities;
}
private string GetWikimediaExtension(Wikimedia wikimedia) {
    switch (wikimedia) {
        case Wikimedia.Wikipedia:
            return "wiki";
        case Wikimedia.Wiktionary:
            return "wiktionary";
        default:
            // The two-string ArgumentException constructor treats the second string as a parameter name,
            // so format the message explicitly instead of passing the enum name as a "format argument".
            throw new ArgumentException(string.Format("Wikimedia '{0}' is not supported", Enum.GetName(typeof(Wikimedia), wikimedia)));
    }
}
static void Main(string[] args) {
    if (!ParseArgs(args)) {
        Usage();
        return;
    }

    var startTime = DateTime.Now;
    var wm = new Wikimedia(inputFile);
    IEnumerable<WikimediaPage> articles = wm.Articles
        .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

    if (!string.IsNullOrEmpty(title)) {
        var targetArticle = articles.FirstOrDefault(
            article => article.Title.Equals(title, StringComparison.CurrentCultureIgnoreCase));
        if (targetArticle == null) {
            Console.WriteLine("Could not find article '{0}'", title);
            return;
        }
        File.WriteAllText(outputFile, rawText ? targetArticle.Text : targetArticle.Plaintext);
    } else {
        if (articleLimit > 0) {
            articles = articles.Take(articleLimit);
        }
        var numberOfArticles = Wikimedia.WriteToDisk(articles, outputFile);
        Console.WriteLine("Wrote {0} articles to disk.", numberOfArticles);
    }

    var endTime = DateTime.Now;
    TimeSpan processTime = endTime - startTime;
    Console.WriteLine("Process took " + processTime);
}
/// <summary>
/// Creates a translation dictionary between <paramref name="srcLanguage"/> and <paramref name="tgtLanguage"/> by:
/// - downloading the appropriate dump files
/// - extracting information from them
/// - associating the information from the different files
/// </summary>
/// <param name="srcLanguage">The ISO-639-1 code for the source language</param>
/// <param name="tgtLanguage">The ISO-639-1 code for the target language</param>
/// <param name="wikimedia">The wikimedia resource (Wikipedia, Wiktionary, ...)</param>
/// <returns>The path to the file containing the translated entities</returns>
public string CreateDictionary(string srcLanguage, string tgtLanguage, Wikimedia wikimedia, DumpDownloader dumpDownloader) {
    Console.WriteLine("Start creating dictionary {0}-{1}", srcLanguage, tgtLanguage);

    // Creates the output file name
    var outputFilePath = PathToProject + string.Format("Output\\{0}-{1}-{2}-dictionary.txt", srcLanguage, tgtLanguage, Enum.GetName(typeof(Wikimedia), wikimedia));

    TranslatedEntities translatedEntities;
    if (wikimedia == Wikimedia.Wikipedia) {
        // Creates the dictionary from the language links between the pages
        translatedEntities = ExtractTranslatedEntitiesFromLanguageLinks(srcLanguage, tgtLanguage, wikimedia, dumpDownloader);
    } else if (wikimedia == Wikimedia.Wiktionary) {
        // Creates the dictionary from the content of Wiktionary pages (a translation section is sometimes present on Wiktionary)
        translatedEntities = ExtractTranslatedEntitiesFromWiktionaryContent(srcLanguage, tgtLanguage, wikimedia, dumpDownloader);
    } else {
        throw new ArgumentException(string.Format("Wikimedia '{0}' is not supported for building translation dictionaries", Enum.GetName(typeof(Wikimedia), wikimedia)));
    }

    // Write all translated entities
    if (File.Exists(outputFilePath)) {
        File.Delete(outputFilePath);
    }
    File.AppendAllLines(outputFilePath, translatedEntities.GetTextFileLines());

    Console.WriteLine("Finished creating dictionary {0}-{1}", srcLanguage, tgtLanguage);
    Console.WriteLine("----");
    return outputFilePath;
}
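// Hedged usage sketch (not in the original source): builds an English-French dictionary from Wiktionary content.
// The parameterless DumpDownloader constructor and the DictionaryBuilder host class name are assumptions for
// illustration; only CreateDictionary, Wikimedia.Wiktionary and DumpDownloader appear in the code above.
public static void BuildSampleDictionary() {
    var dumpDownloader = new DumpDownloader();   // assumed parameterless constructor
    var builder = new DictionaryBuilder();       // hypothetical class exposing CreateDictionary
    var outputFilePath = builder.CreateDictionary("en", "fr", Wikimedia.Wiktionary, dumpDownloader);
    Console.WriteLine("Dictionary file: {0}", outputFilePath);
}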
/// <summary>
/// Creates a translation dictionary from language links for a wikimedia resource. Language links are a specific kind of
/// interwiki link (i.e. a link between two different domains, such as en.wikipedia.org and fr.wikipedia.org).
/// See https://en.wiktionary.org/wiki/Help:FAQ#What_are_interwiki_links.3F for more details.
/// </summary>
/// <param name="srcLanguage">The ISO-639-1 code for the source language</param>
/// <param name="tgtLanguage">The ISO-639-1 code for the target language</param>
/// <param name="wikimedia">The wikimedia resource (Wikipedia, Wiktionary, ...)</param>
/// <returns>The collection of translated entities</returns>
private TranslatedEntities ExtractTranslatedEntitiesFromLanguageLinks(string srcLanguage, string tgtLanguage, Wikimedia wikimedia, DumpDownloader dumpDownloader) {
    var translatedEntities = new TranslatedEntities() {
        SrcLanguage = srcLanguage,
        TgtLanguage = tgtLanguage,
        Entities = new List<TranslatedEntity>()
    };

    // Download dump files with the pages and langlinks of the source language (always take the latest version)
    var pageDumpFileName = string.Format("{0}{1}-latest-page.sql.gz", srcLanguage, GetWikimediaExtension(wikimedia));
    var srcPagePropsFilePath = dumpDownloader.DownloadFile(pageDumpFileName);
    var langLinksDumpFileName = string.Format("{0}{1}-latest-langlinks.sql.gz", srcLanguage, GetWikimediaExtension(wikimedia));
    var srcLangLinksFilePath = dumpDownloader.DownloadFile(langLinksDumpFileName);

    // Parse language links and load them into a dictionary for fast retrieval
    Console.WriteLine("Start parsing language links");
    var parser = new DumpParser();
    var languageLinks = parser.ParseLanguageLinks(srcLangLinksFilePath, tgtLanguage)
        .ToDictionary(ll => ll.Id, ll => ll);
    Console.WriteLine("{0} language links found", languageLinks.Count);

    // Associate the pages (with titles in the source language) with the language links (with titles in the target language)
    Console.WriteLine("Start associating pages and language links");
    var counter = 0;
    var pageInfoReader = new SqlDumpFileReader(srcPagePropsFilePath);
    var pageInfo = pageInfoReader.ReadNext();
    while (pageInfo != null) {
        LanguageLink languageLink;
        if (languageLinks.TryGetValue(pageInfo.Id, out languageLink)) {
            counter++;
            translatedEntities.Entities.Add(new TranslatedEntity() {
                SrcName = pageInfo.GetDisplayedTitle(),
                TgtName = languageLink.GetDisplayedTitle()
            });
        }
        pageInfo = pageInfoReader.ReadNext();
    }
    Console.WriteLine("Associated {0} entries for {1}-{2}", counter, srcLanguage, tgtLanguage);

    return translatedEntities;
}