public static void Convert(GeneralUris generalUris, string srcFile, string destDir, SiteInfo website)
{
    // Converts a single XML source file into a Turtle (.ttl) file inside destDir.
    // The output file is named "<website>-<source file name without extension>.ttl".
    // Which converter runs is decided by the local name of the first XML element.
    string sourceName = Path.GetFileName(srcFile);
    Console.WriteLine("Processing {0} ...", sourceName);

    var siteUris = new SiteUris(generalUris, website);
    var namespaces = siteUris.CreateNamespaceMapper();

    string targetPath = Path.Combine(destDir, website + "-" + Path.GetFileNameWithoutExtension(srcFile) + ".ttl");
    using (var turtle = new SequentialTurtleWriter(File.CreateText(targetPath), namespaces))
    {
        using (var input = File.OpenRead(srcFile))
        using (var xml = XmlReader.Create(input))
        {
            // Move forward until the reader is positioned on the first element node.
            bool onRootElement = xml.NodeType == XmlNodeType.Element;
            while (!onRootElement && xml.Read())
            {
                onRootElement = xml.NodeType == XmlNodeType.Element;
            }

            // The input ended before any element was seen.
            if (!onRootElement)
            {
                ConsoleHelper.WriteErrorLine("No contents found in file {0}.", srcFile);
                return;
            }

            string rootName = xml.LocalName;
            switch (rootName)
            {
                case "badges":
                    ConsoleHelper.WriteInfoLine("List of badges identified.");
                    ConvertBadges(siteUris, xml, turtle);
                    break;

                case "comments":
                    ConsoleHelper.WriteInfoLine("List of comments identified.");
                    ConvertComments(siteUris, xml, turtle);
                    break;

                case "posthistory":
                    ConsoleHelper.WriteInfoLine("List of posthistory identified.");
                    ConvertPostHistory(siteUris, xml, turtle);
                    break;

                case "postlinks":
                    ConsoleHelper.WriteInfoLine("List of postlinks identified.");
                    ConvertPostLinks(siteUris, xml, turtle);
                    break;

                case "posts":
                    ConsoleHelper.WriteInfoLine("List of posts identified.");
                    ConvertPosts(siteUris, xml, turtle);
                    break;

                case "tags":
                    ConsoleHelper.WriteInfoLine("List of tags identified.");
                    ConvertTags(siteUris, xml, turtle);
                    break;

                case "users":
                    ConsoleHelper.WriteInfoLine("List of users identified.");
                    ConvertUsers(siteUris, xml, turtle);
                    break;

                case "votes":
                    ConsoleHelper.WriteInfoLine("List of votes identified.");
                    ConvertVotes(siteUris, xml, turtle);
                    break;

                default:
                    ConsoleHelper.WriteWarningLine("Unknown root element \"{0}\". Skipping document.", rootName);
                    break;
            }
        }

        // Statistics are collected once the input has been closed, while the writer is still open.
        GlobalData.UpdateStats(turtle);
    }

    Console.WriteLine("Conversion of {0} completed.", sourceName);
}
/// <summary>
/// Downloads and converts the data.
/// </summary>
/// <remarks>
/// <para>Working directories <c>tmp</c> and <c>rdf</c> are created below <c>BaseDir</c>.</para>
/// <para>Unless the <c>OntologyOnly</c> option is set, the files list is downloaded to <c>tmp/files.xml</c>,
/// loaded and sorted. Every listed file whose site passes <c>IncludeSite</c> is registered in
/// <c>GlobalData.Sites</c>; unless the <c>SiteListOnly</c> option is set, it is also downloaded, extracted
/// (only <c>.7z</c> archives are recognised) and each extracted file is converted. Processing stops once
/// the number of included files reaches the <c>MaxFileCount</c> option.</para>
/// <para>Afterwards the global information is converted, the temporary directory is removed unless the
/// <c>KeepTemporaryFiles</c> option is set, and statistics and timing information are printed.</para>
/// </remarks>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="filelist">The URL of a filelist Xml file.</param>
/// <exception cref="ArgumentNullException">Any of the arguments is <see langword="null"/>.</exception>
private static void RetrieveData(GeneralUris generalUris, Uri filelist)
{
    if (generalUris == null)
    {
        throw new ArgumentNullException("generalUris");
    }
    if (filelist == null)
    {
        throw new ArgumentNullException("filelist");
    }

    string tempDir = Path.Combine(BaseDir, "tmp");
    string destDir = Path.Combine(BaseDir, "rdf");
    Directory.CreateDirectory(tempDir);
    Directory.CreateDirectory(destDir);

    // The wall-clock time is only displayed. The duration is measured with a stopwatch so that
    // clock adjustments (DST switch, time synchronisation) during a long run cannot distort it.
    DateTime startTime = DateTime.Now;
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
    ConsoleHelper.WriteInfoLine("Current time: {0:yyyy-MM-dd HH:mm:ss}", startTime);

    if (!GlobalData.Options.OntologyOnly)
    {
        string filesListPath = Path.Combine(tempDir, "files.xml");

        ConsoleHelper.WriteMilestone("Downloading files list ...");
        using (var client = new WebClient())
        {
            client.DownloadFile(filelist, filesListPath);
        }
        Console.WriteLine(" done.");

        var files = LoadFilesList(filesListPath).OrderBy(f => f).ToArray();
        ConsoleHelper.WriteInfoLine("{0} file(s) in list, totalling to a compressed size of {1:F1} GB.", files.Length, (double)files.Sum(f => f.Size) / 1024 / 1024 / 1024);

        int processedFilesCount = 0;
        foreach (var f in files)
        {
            SiteInfo siteInfo;
            try
            {
                siteInfo = f.RetrieveSiteInfo();
            }
            catch (ArgumentException ex)
            {
                ConsoleHelper.WriteErrorLine("Skipping file {0}, as it cannot be associated with a website.\n{1}", f, ex);
                continue;
            }

            if (IncludeSite(siteInfo))
            {
                GlobalData.Sites[siteInfo.Id] = siteInfo;

                if (!GlobalData.Options.SiteListOnly)
                {
                    string fn = f.Download(filelist, tempDir);

                    // Stays null when the archive type is not recognised, which skips the conversion below.
                    string[] rawFiles = null;
                    switch (Path.GetExtension(fn))
                    {
                        case ".7z":
                            rawFiles = ExtractSevenZipArchive(fn);
                            break;
                        default:
                            ConsoleHelper.WriteWarningLine("File {0} has an unknown file extension.", fn);
                            break;
                    }

                    if (rawFiles != null)
                    {
                        ConsoleHelper.WriteInfoLine("{0} file(s) extracted.", rawFiles.Length);
                        foreach (var rawFile in rawFiles)
                        {
                            Converter.Convert(generalUris, rawFile, destDir, siteInfo);
                        }
                    }
                }

                // Only files belonging to an included site count towards the limit.
                processedFilesCount++;
                if (processedFilesCount >= GlobalData.Options.MaxFileCount)
                {
                    break;
                }
            }
        }
    }

    GlobalInformationConverter.Convert(generalUris, tempDir, filelist, destDir);

    if (!GlobalData.Options.KeepTemporaryFiles)
    {
        Console.Write("Removing temporary files ... ");
        try
        {
            Directory.Delete(tempDir, true);
            Console.WriteLine(" done.");
        }
        catch (Exception ex)
        {
            // Cleanup is best effort: a failure must not abort the run, but the reason is
            // reported so the user knows why the directory could not be removed.
            ConsoleHelper.WriteErrorLine("Please remove the directory {0} manually.", tempDir);
            ConsoleHelper.WriteErrorLine("Reason: {0}", ex.Message);
        }
    }

    Console.WriteLine();
    GlobalData.PrintStats();

    stopwatch.Stop();
    DateTime endTime = DateTime.Now;
    ConsoleHelper.WriteInfoLine("Current time: {0:yyyy-MM-dd HH:mm:ss}", endTime);
    ConsoleHelper.WriteInfoLine("Total duration: {0}", stopwatch.Elapsed);
}