Esempio n. 1
0
        /// <summary>
        /// Program entry point: parses the command line into <c>GlobalData.Options</c>,
        /// validates the options and then starts the data retrieval.
        /// </summary>
        /// <param name="args">The raw command line arguments.</param>
        public static void Main(string[] args)
        {
            // Run the whole program under the en-US culture (both regular and UI culture).
            var enUs = new System.Globalization.CultureInfo("en-US");
            var currentThread = System.Threading.Thread.CurrentThread;

            currentThread.CurrentCulture   = enUs;
            currentThread.CurrentUICulture = enUs;

            try {
                using (var argumentParser = new CommandLine.Parser()) {
                    bool parsed = argumentParser.ParseArguments(args, GlobalData.Options);
                    if (!parsed)
                    {
                        // Unusable command line: show the usage text and stop.
                        Console.WriteLine(GlobalData.Options.GetUsage());
                        return;
                    }
                }

                var options = GlobalData.Options;

                if (options.MaxFileCount < 1)
                {
                    ConsoleHelper.WriteErrorLine("As the maximum file count is set to {0}, no files can be imported.",
                                                 options.MaxFileCount);
                    return;
                }

                if (options.Files.Count < 1)
                {
                    // Nothing given on the command line: fall back to the default file list URL.
                    ConsoleHelper.WriteInfoLine("No input file specified. Using default URL {0}.",
                                                DefaultFileUrl);
                    options.Files.Add(DefaultFileUrl);
                }

                string siteNamePattern = options.SiteNamePattern;
                if (!string.IsNullOrWhiteSpace(siteNamePattern))
                {
                    try {
                        GlobalData.SiteNamePattern = new Regex(siteNamePattern);
                    }
                    catch (ArgumentException) {
                        ConsoleHelper.WriteErrorLine("Invalid regular expression: \"{0}\"",
                                                     siteNamePattern);
                        return;
                    }
                }

                var generalUris = new GeneralUris(options.GeneralPrefix);

                try {
                    // Only the first file entry is used; it is interpreted as the file list URL.
                    RetrieveData(generalUris, new Uri(options.Files[0]));
                }
                catch (UriFormatException) {
                    // Print a hint, then let the outer handler dump the exception details.
                    Console.WriteLine("Invalid filelist URL:");
                    throw;
                }
            }
            catch (Exception ex) {
                Console.WriteLine(ex);
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Converts a single XML source file into a <c>.ttl</c> file in the destination directory.
        /// The converter to run is chosen by the local name of the first element found in the file.
        /// </summary>
        /// <param name="generalUris">General URIs passed on to the <c>SiteUris</c> created for <paramref name="website"/>.</param>
        /// <param name="srcFile">Path of the XML file to read.</param>
        /// <param name="destDir">Directory in which the output file is created.</param>
        /// <param name="website">The site the file belongs to; its string form prefixes the output file name.</param>
        public static void Convert(GeneralUris generalUris, string srcFile, string destDir, SiteInfo website)
        {
            string displayName = Path.GetFileName(srcFile);

            Console.WriteLine("Processing {0} ...", displayName);

            var siteUris = new SiteUris(generalUris, website);
            var namespaces = siteUris.CreateNamespaceMapper();

            // Output file: "<website>-<source name without extension>.ttl" inside destDir.
            string targetName = website + "-" + Path.GetFileNameWithoutExtension(srcFile) + ".ttl";
            string targetPath = Path.Combine(destDir, targetName);

            using (var turtle = new SequentialTurtleWriter(File.CreateText(targetPath), namespaces)) {
                using (var input = File.OpenRead(srcFile))
                using (var xml = XmlReader.Create(input)) {
                    // Advance to the first element node; give up if the document ends before one is found.
                    bool atElement = xml.NodeType == XmlNodeType.Element;
                    while (!atElement)
                    {
                        if (!xml.Read())
                        {
                            ConsoleHelper.WriteErrorLine("No contents found in file {0}.", srcFile);
                            return;
                        }
                        atElement = xml.NodeType == XmlNodeType.Element;
                    }

                    string rootName = xml.LocalName;
                    switch (rootName)
                    {
                        case "badges":
                            ConsoleHelper.WriteInfoLine("List of badges identified.");
                            ConvertBadges(siteUris, xml, turtle);
                            break;

                        case "comments":
                            ConsoleHelper.WriteInfoLine("List of comments identified.");
                            ConvertComments(siteUris, xml, turtle);
                            break;

                        case "posthistory":
                            ConsoleHelper.WriteInfoLine("List of posthistory identified.");
                            ConvertPostHistory(siteUris, xml, turtle);
                            break;

                        case "postlinks":
                            ConsoleHelper.WriteInfoLine("List of postlinks identified.");
                            ConvertPostLinks(siteUris, xml, turtle);
                            break;

                        case "posts":
                            ConsoleHelper.WriteInfoLine("List of posts identified.");
                            ConvertPosts(siteUris, xml, turtle);
                            break;

                        case "tags":
                            ConsoleHelper.WriteInfoLine("List of tags identified.");
                            ConvertTags(siteUris, xml, turtle);
                            break;

                        case "users":
                            ConsoleHelper.WriteInfoLine("List of users identified.");
                            ConvertUsers(siteUris, xml, turtle);
                            break;

                        case "votes":
                            ConsoleHelper.WriteInfoLine("List of votes identified.");
                            ConvertVotes(siteUris, xml, turtle);
                            break;

                        default:
                            ConsoleHelper.WriteWarningLine("Unknown root element \"{0}\". Skipping document.", rootName);
                            break;
                    }
                }

                // Runs only after the input has been closed and while the writer is still open.
                GlobalData.UpdateStats(turtle);
            }

            Console.WriteLine("Conversion of {0} completed.", displayName);
        }
Esempio n. 3
0
        /// <summary>
        /// Downloads and converts the data.
        /// </summary>
        /// <remarks>
        /// Works in the <c>tmp</c> (downloads) and <c>rdf</c> (output) directories below <c>BaseDir</c>.
        /// Unless the <c>OntologyOnly</c> option is set, the files list is downloaded and every listed
        /// file whose site passes <c>IncludeSite</c> is registered in <c>GlobalData.Sites</c>; unless
        /// <c>SiteListOnly</c> is set, the file is also downloaded, extracted and converted. At most
        /// <c>MaxFileCount</c> included files are handled. Afterwards the global information is
        /// converted, the temporary directory is removed (unless <c>KeepTemporaryFiles</c> is set)
        /// and statistics are printed.
        /// </remarks>
        /// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
        /// <param name="filelist">The URL of a filelist Xml file.</param>
        /// <exception cref="ArgumentNullException">Any of the arguments is <see langword="null"/>.</exception>
        private static void RetrieveData(GeneralUris generalUris, Uri filelist)
        {
            if (generalUris == null)
            {
                throw new ArgumentNullException("generalUris");
            }
            if (filelist == null)
            {
                throw new ArgumentNullException("filelist");
            }

            string tempDir = Path.Combine(BaseDir, "tmp");
            string destDir = Path.Combine(BaseDir, "rdf");

            Directory.CreateDirectory(tempDir);
            Directory.CreateDirectory(destDir);

            // The wall-clock time is only displayed. The duration is measured with a stopwatch,
            // so that clock adjustments (DST switch, time synchronisation) during a long run
            // cannot distort or even negate the reported total.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

            ConsoleHelper.WriteInfoLine("Current time: {0:yyyy-MM-dd HH:mm:ss}", DateTime.Now);

            if (!GlobalData.Options.OntologyOnly)
            {
                string filesListPath = Path.Combine(tempDir, "files.xml");

                ConsoleHelper.WriteMilestone("Downloading files list ...");
                using (var client = new WebClient()) {
                    client.DownloadFile(filelist, filesListPath);
                }
                Console.WriteLine(" done.");

                var files = LoadFilesList(filesListPath).OrderBy(f => f).ToArray();
                ConsoleHelper.WriteInfoLine("{0} file(s) in list, totalling to a compressed size of {1:F1} GB.",
                                            files.Length, (double)files.Sum(f => f.Size) / 1024 / 1024 / 1024);

                int processedFilesCount = 0;
                foreach (var f in files)
                {
                    SiteInfo siteInfo;
                    try {
                        siteInfo = f.RetrieveSiteInfo();
                    }
                    catch (ArgumentException ex) {
                        ConsoleHelper.WriteErrorLine("Skipping file {0}, as it cannot be associated with a website.\n{1}", f, ex);
                        continue;
                    }

                    if (IncludeSite(siteInfo))
                    {
                        GlobalData.Sites[siteInfo.Id] = siteInfo;

                        if (!GlobalData.Options.SiteListOnly)
                        {
                            string fn = f.Download(filelist, tempDir);

                            // Stays null when the archive type is not supported, which skips the conversion.
                            string[] rawFiles = null;
                            switch (Path.GetExtension(fn))
                            {
                            case ".7z":
                                rawFiles = ExtractSevenZipArchive(fn);
                                break;

                            default:
                                ConsoleHelper.WriteWarningLine("File {0} has an unknown file extension.", fn);
                                break;
                            }

                            if (rawFiles != null)
                            {
                                ConsoleHelper.WriteInfoLine("{0} file(s) extracted.", rawFiles.Length);

                                foreach (var rawFile in rawFiles)
                                {
                                    Converter.Convert(generalUris, rawFile, destDir, siteInfo);
                                }
                            }
                        }

                        // Every included file counts towards the limit, whether or not it was converted.
                        processedFilesCount++;
                        if (processedFilesCount >= GlobalData.Options.MaxFileCount)
                        {
                            break;
                        }
                    }
                }
            }

            GlobalInformationConverter.Convert(generalUris, tempDir, filelist, destDir);

            if (!GlobalData.Options.KeepTemporaryFiles)
            {
                Console.Write("Removing temporary files ... ");
                try {
                    Directory.Delete(tempDir, true);
                    Console.WriteLine(" done.");
                }
                catch (Exception ex) {
                    // Cleanup is best effort: report why it failed, but still print the statistics below.
                    ConsoleHelper.WriteErrorLine("Please remove the directory {0} manually.\n{1}", tempDir, ex.Message);
                }
            }

            Console.WriteLine();
            GlobalData.PrintStats();

            stopwatch.Stop();

            ConsoleHelper.WriteInfoLine("Current time: {0:yyyy-MM-dd HH:mm:ss}", DateTime.Now);
            ConsoleHelper.WriteInfoLine("Total duration: {0}", stopwatch.Elapsed);
        }