/// <summary>
        /// Reads the child nodes of the current element and converts each <c>row</c> element into post triples.
        /// </summary>
        /// <param name="uris">Provides the URIs used for the generated triples.</param>
        /// <param name="r">The reader to consume nodes from.</param>
        /// <param name="w">The writer that receives the triples.</param>
        /// <remarks>
        /// Reading stops at the first end-element node returned by <paramref name="r"/>. At that point a
        /// single warning lists the <c>PostTypeId</c> values that were registered as unknown, if any.
        /// </remarks>
        private static void ConvertPosts(SiteUris uris, XmlReader r, SequentialTurtleWriter w)
        {
            var unknownPostTypeIds = new UnknownValueStore <string>();

            while (r.Read())
            {
                if (r.NodeType == XmlNodeType.EndElement)
                {
                    // End of the list: summarise the unknown values once and stop.
                    long unknownCount = unknownPostTypeIds.RegisteredValueCount;
                    if (unknownCount > 0)
                    {
                        ConsoleHelper.WriteWarningLine("{0} unknown PostTypeId value(s) found: {1}", unknownCount, unknownPostTypeIds);
                    }

                    return;
                }

                // Only <row> elements carry data; every other node is skipped.
                if (r.NodeType != XmlNodeType.Element || r.LocalName != "row")
                {
                    continue;
                }

                using (XmlReader rowReader = r.ReadSubtree())
                {
                    // A subtree reader starts before its root node; move onto the row element.
                    rowReader.Read();
                    ConvertPost(uris, rowReader, w, unknownPostTypeIds);
                }
            }
        }
示例#2
0
        /// <summary>
        /// Writes the triples that describe a single site, based on the attributes of the current element.
        /// </summary>
        /// <param name="generalUris">Provides the general URIs used for the generated triples.</param>
        /// <param name="r">The reader whose current element describes the site.</param>
        /// <param name="w">The writer that receives the triples.</param>
        /// <returns>
        /// <see langword="true"/> if the site was written; <see langword="false"/> if the element has no
        /// <c>Address</c> attribute or the address is not contained in <c>GlobalData.Sites</c>.
        /// </returns>
        private static bool ConvertSite(GeneralUris generalUris, XmlReader r, SequentialTurtleWriter w)
        {
            if (!r.MoveToAttribute("Address"))
            {
                // Step back from attribute level so that the whole element is shown in the message.
                r.MoveToElement();
                ConsoleHelper.WriteErrorLine("No Address attribute found on element {0}. Skipping element.", r.ReadOuterXml());
                return false;
            }

            string   address = r.Value;
            SiteInfo info;
            if (!GlobalData.Sites.TryGetValue(address, out info))
            {
                // Sites that are not registered are skipped without a message.
                return false;
            }

            Uri subjectUri = generalUris.CreateSiteUri(info);
            w.StartTriple(subjectUri);
            w.AddToTriple(generalUris.IsMetaSiteProperty, info.IsMetaSite);
            w.AddToTriple(generalUris.LanguageProperty, info.IsEnglishSite ? "en" : info.Language);
            w.AddToTriple(generalUris.TypeProperty, generalUris.SiteInfoType);
            w.AddToTriple(generalUris.WebsiteProperty, new Uri("http://" + address));

            if (r.MoveToAttribute("Name"))
            {
                // The site name serves as both label and title.
                string siteName = r.Value;
                w.AddToTriple(generalUris.LabelProperty, siteName);
                w.AddToTriple(generalUris.TitleProperty, siteName);
            }

            if (r.MoveToAttribute("Description"))
            {
                w.AddToTriple(generalUris.DescriptionProperty, r.Value);
            }

            if (r.MoveToAttribute("ParentAddress"))
            {
                SiteInfo parentInfo;
                if (!GlobalData.Sites.TryGetValue(r.Value, out parentInfo))
                {
                    ConsoleHelper.WriteWarningLine("Unknown parent site {0}; skipping information.", r.Value);
                }
                else
                {
                    w.AddToTriple(generalUris.ParentSiteProperty, generalUris.CreateSiteUri(parentInfo));
                }
            }

            return true;
        }
 /// <summary>
 /// Links the post named by the <c>PostId</c> attribute to the given subject through the event property.
 /// </summary>
 /// <param name="uris">Provides the URIs used for the generated triples.</param>
 /// <param name="subjectUri">The URI of the item that is attached to the post.</param>
 /// <param name="r">The reader positioned on the element whose attributes are inspected.</param>
 /// <param name="w">The writer that receives the triples.</param>
 /// <remarks>
 /// After writing the link, the writer is switched back to <paramref name="subjectUri"/> as subject.
 /// If there is no <c>PostId</c> attribute, only a warning is written.
 /// </remarks>
 private static void LinkToPost(SiteUris uris, Uri subjectUri, XmlReader r, SequentialTurtleWriter w)
 {
     if (!r.MoveToAttribute("PostId"))
     {
         ConsoleHelper.WriteWarningLine("Orphaned post history item: {0}", subjectUri.AbsoluteUri);
         return;
     }

     var postUri = uris.CreatePostUri(r.Value);

     // The link is stated with the post as subject; afterwards the original subject is restored.
     w.StartTriple(postUri);
     w.AddToTriple(uris.GeneralUris.EventProperty, subjectUri);
     w.StartTriple(subjectUri);
 }
        /// <summary>
        /// Reads the child nodes of the current element and converts each <c>row</c> element into user triples.
        /// </summary>
        /// <param name="uris">Provides the URIs used for the generated triples.</param>
        /// <param name="r">The reader to consume nodes from.</param>
        /// <param name="w">The writer that receives the triples.</param>
        /// <remarks>
        /// Reading stops at the first end-element node returned by <paramref name="r"/>. If malformed URLs
        /// were counted by then, one warning reports their total number and the collected examples.
        /// </remarks>
        private static void ConvertUsers(SiteUris uris, XmlReader r, SequentialTurtleWriter w)
        {
            var  malformedIris          = new List <string>();
            long totalMalformedIriCount = 0;

            while (r.Read())
            {
                if (r.NodeType == XmlNodeType.EndElement)
                {
                    if (totalMalformedIriCount > 0)
                    {
                        // Examples are only listed if some were actually collected.
                        string example = malformedIris.Count > 0
                            ? " (e.g. " + string.Join("; ", malformedIris) + ")"
                            : "";
                        ConsoleHelper.WriteWarningLine("{1} malformed URL(s) found{0}, treated as string literals.", example, totalMalformedIriCount);
                    }

                    return;
                }

                // Only <row> elements carry data; every other node is skipped.
                if (r.NodeType != XmlNodeType.Element || r.LocalName != "row")
                {
                    continue;
                }

                using (XmlReader rowReader = r.ReadSubtree())
                {
                    // A subtree reader starts before its root node; move onto the row element.
                    rowReader.Read();
                    ConvertUser(uris, rowReader, w, malformedIris, ref totalMalformedIriCount);
                }
            }
        }
示例#5
0
        /// <summary>
        /// Downloads the file to the local file system.
        /// </summary>
        /// <param name="baseUri">The base URI that the relative file name can be expanded from.</param>
        /// <param name="directory">The destination directory.</param>
        /// <returns>The path to the downloaded file.</returns>
        /// <exception cref="ArgumentNullException">Any of the arguments is <see langword="null"/>.</exception>
        /// <remarks>
        /// <para>This method downloads the file to the local file system.
        ///   After downloading, the file size and the MD5 hash of the file contents are compared
        ///   with the expected values. A mismatch is reported as a warning only; the path of the
        ///   downloaded file is returned in either case.</para>
        /// </remarks>
        public string Download(Uri baseUri, string directory)
        {
            if (baseUri == null)
            {
                throw new ArgumentNullException("baseUri");
            }
            if (directory == null)
            {
                // Validate explicitly so that the exception names the caller's parameter.
                throw new ArgumentNullException("directory");
            }

            string fn = Path.Combine(directory, name);

            ConsoleHelper.WriteMilestone("Downloading {0} ...", name);
            using (var client = new WebClient()) {
                client.DownloadFile(new Uri(baseUri, name), fn);
            }
            System.Console.WriteLine(" done.");

            var fInfo = new FileInfo(fn);

            long fileSize = fInfo.Length;

            if (fileSize == this.size)
            {
                ConsoleHelper.WriteSuccessLine("File size of {0} bytes verified.", fileSize);
            }
            else
            {
                ConsoleHelper.WriteWarningLine("File size is {0} bytes, which differs from the expected size of {1} bytes.", fileSize, this.size);
            }

            byte[] hash;
            using (var algo = MD5.Create()) {
                using (var fs = fInfo.OpenRead()) {
                    hash = algo.ComputeHash(fs);
                }
            }

            // Render the digest as lower-case hexadecimal, two digits per byte.
            string fileMD5 = string.Concat(hash.Select(b => b.ToString("x2", System.Globalization.CultureInfo.InvariantCulture)));

            // Hex digests are case-insensitive; an upper-case expected value must not be reported as a mismatch.
            if (string.Equals(fileMD5, this.md5, StringComparison.OrdinalIgnoreCase))
            {
                ConsoleHelper.WriteSuccessLine("MD5 hash ({0}) verified.", fileMD5);
            }
            else
            {
                ConsoleHelper.WriteWarningLine("MD5 hash of file ({0}) does not match the expected one ({1}).", fileMD5, this.md5);
            }

            return(fn);
        }
        /// <summary>
        /// Converts a single comment element into triples.
        /// </summary>
        /// <param name="uris">Provides the URIs used for the generated triples.</param>
        /// <param name="r">The reader whose current element describes the comment.</param>
        /// <param name="w">The writer that receives the triples.</param>
        /// <remarks>
        /// Elements without an <c>Id</c> attribute are reported as an error and skipped. Numbers and
        /// dates are parsed with the invariant culture, so the result does not depend on the regional
        /// settings of the machine that runs the conversion.
        /// </remarks>
        private static void ConvertComment(SiteUris uris, XmlReader r, SequentialTurtleWriter w)
        {
            Uri subjectUri;

            if (r.MoveToAttribute("Id"))
            {
                subjectUri = uris.CreateCommentUri(r.Value);
                w.StartTriple(subjectUri);
            }
            else
            {
                // Step back from attribute level so that the whole element is shown in the message.
                r.MoveToElement();
                ConsoleHelper.WriteErrorLine("No Id attribute found on element {0}. Skipping element.", r.ReadOuterXml());
                return;
            }

            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.CommentType);
            uris.LinkToSite(w);
            if (r.MoveToAttribute("Score"))
            {
                // Machine-readable data: parse independently of the current culture.
                w.AddToTriple(uris.GeneralUris.ScoreProperty, long.Parse(r.Value, System.Globalization.CultureInfo.InvariantCulture));
            }
            if (r.MoveToAttribute("Text"))
            {
                w.AddToTriple(uris.GeneralUris.DescriptionProperty, r.Value);
            }
            if (r.MoveToAttribute("CreationDate"))
            {
                w.AddToTriple(uris.GeneralUris.DateProperty, DateTime.Parse(r.Value, System.Globalization.CultureInfo.InvariantCulture));
            }
            if (r.MoveToAttribute("UserId"))
            {
                w.AddToTriple(uris.GeneralUris.OwnerProperty, uris.CreateUserUri(r.Value));
            }

            if (r.MoveToAttribute("PostId"))
            {
                // The link is stated with the post as subject; afterwards the comment is restored as subject.
                w.StartTriple(uris.CreatePostUri(r.Value));
                w.AddToTriple(uris.GeneralUris.CommentProperty, subjectUri);
                w.StartTriple(subjectUri);
            }
            else
            {
                ConsoleHelper.WriteWarningLine("Orphaned comment: {0}", subjectUri);
            }
        }
示例#7
0
        /// <summary>
        /// Downloads <c>Sites.xml</c> relative to <paramref name="baseUri"/> into <paramref name="tempDir"/>
        /// and converts it into the file <c>_sites.ttl</c> in <paramref name="destDir"/>.
        /// </summary>
        /// <param name="generalUris">Provides the general URIs used for the generated triples.</param>
        /// <param name="tempDir">The directory that receives the downloaded file.</param>
        /// <param name="baseUri">The URI that <c>Sites.xml</c> is resolved against.</param>
        /// <param name="destDir">The directory that receives the converted file.</param>
        /// <param name="nsMapper">The namespace mapper handed to the Turtle writer.</param>
        /// <remarks>
        /// If the downloaded document contains no element at all, an error is written and the method
        /// returns without updating the statistics or printing the completion message.
        /// </remarks>
        private static void ConvertSiteList(GeneralUris generalUris, string tempDir, Uri baseUri, string destDir, VDS.RDF.INamespaceMapper nsMapper)
        {
            const string siteListFileName = "Sites.xml";

            string srcFile = Path.Combine(tempDir, siteListFileName);

            ConsoleHelper.WriteMilestone("Downloading site list ...");
            using (var client = new WebClient())
            {
                client.DownloadFile(new Uri(baseUri, siteListFileName), srcFile);
            }
            Console.WriteLine(" done.");

            string destFile = Path.Combine(destDir, "_sites.ttl");
            using (var destWriter = new SequentialTurtleWriter(File.CreateText(destFile), nsMapper))
            {
                using (var fs = File.OpenRead(srcFile))
                using (var reader = XmlReader.Create(fs))
                {
                    // Advance to the root element of the document.
                    while (reader.NodeType != XmlNodeType.Element)
                    {
                        if (reader.Read())
                        {
                            continue;
                        }

                        ConsoleHelper.WriteErrorLine("No contents found in file {0}.", srcFile);
                        return;
                    }

                    if (reader.LocalName != "sitelist")
                    {
                        ConsoleHelper.WriteWarningLine("Unknown root element \"{0}\". Skipping document.", reader.LocalName);
                    }
                    else
                    {
                        ConvertSites(generalUris, reader, destWriter);
                    }
                }

                // The statistics are taken after the input has been closed, but before the writer is disposed.
                GlobalData.UpdateStats(destWriter);
            }

            Console.WriteLine("Conversion of site list completed.");
        }
        /// <summary>
        /// Converts a single post history element into triples.
        /// </summary>
        /// <param name="uris">Provides the URIs used for the generated triples.</param>
        /// <param name="r">The reader whose current element describes the post history item.</param>
        /// <param name="w">The writer that receives the triples.</param>
        /// <param name="unknownPostHistoryTypeIds">Receives every <c>PostHistoryTypeId</c> value that is not known here.</param>
        /// <remarks>
        /// Only closing, reopening, deletion, undeletion, locking, unlocking, protection and unprotection
        /// produce output. For these, the event type (and, for closures, the close reason) is written first,
        /// followed by the creation date, the participants and the link to the post. All other known type
        /// ids are accepted without output. Elements lacking an <c>Id</c> or <c>PostHistoryTypeId</c>
        /// attribute are reported as errors.
        /// </remarks>
        private static void ConvertPostHistoryItem(SiteUris uris, XmlReader r, SequentialTurtleWriter w, UnknownValueStore <string> unknownPostHistoryTypeIds)
        {
            if (!r.MoveToAttribute("Id"))
            {
                // Step back from attribute level so that the whole element is shown in the message.
                r.MoveToElement();
                ConsoleHelper.WriteErrorLine("No Id attribute found on element {0}. Skipping element.", r.ReadOuterXml());
                return;
            }

            Uri subjectUri = uris.CreatePostHistoryUri(r.Value);
            w.StartTriple(subjectUri);

            if (!r.MoveToAttribute("PostHistoryTypeId"))
            {
                r.MoveToElement();
                ConsoleHelper.WriteErrorLine("No PostHistoryTypeId attribute found on element {0}. Skipping element.", r.ReadOuterXml());
                return;
            }

            // Cleared by the branches that do not describe an exported event.
            bool isExportedEvent = true;

            switch (r.Value)
            {
            case "10":                             // post closed
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostClosureType);
                if (r.MoveToAttribute("Comment"))
                {
                    // For closures, the Comment attribute holds the id of the close reason.
                    switch (r.Value)
                    {
                    case "1":                      // Exact Duplicate
                    case "101":                    // duplicate
                        w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.DuplicateCloseReason);
                        break;

                    case "2":                      // Off-topic
                    case "102":                    // Off-topic
                        w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.OffTopicCloseReason);
                        break;

                    case "3":                      // Subjective and argumentative
                    case "105":                    // Primarily opinion-based
                        w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.SubjectiveCloseReason);
                        break;

                    case "4":                      // Not a real question
                        w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.NotAQuestionCloseReason);
                        break;

                    case "7":                      // Too localized
                        w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.TooLocalizedCloseReason);
                        break;

                    case "10":                     // General reference
                        w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.GeneralReferenceCloseReason);
                        break;

                    case "20":                     // Noise or pointless
                        w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.NoiseCloseReason);
                        break;

                    case "103":                    // Unclear what you're asking
                        w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.UnclearCloseReason);
                        break;

                    case "104":                    // Too broad
                        w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.TooBroadCloseReason);
                        break;

                    default:
                        ConsoleHelper.WriteWarningLine("Unknown post close reason: {0}", r.Value);
                        break;
                    }
                }
                break;

            case "11":                             // post reopened
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostReopeningType);
                break;

            case "12":                             // post deleted
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostDeletionType);
                break;

            case "13":                             // post undeleted
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostUndeletionType);
                break;

            case "14":                             // post locked
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostLockingType);
                break;

            case "15":                             // post unlocked
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostUnlockingType);
                break;

            case "19":                             // question protected
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostProtectionType);
                break;

            case "20":                             // question unprotected
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostUnprotectionType);
                break;

            // Known history types that are deliberately not exported.
            case "1":                              // initial title
            case "2":                              // initial body
            case "3":                              // initial tags
            case "4":                              // edit title
            case "5":                              // edit body
            case "6":                              // edit tags
            case "7":                              // rollback title
            case "8":                              // rollback body
            case "9":                              // rollback tags
            case "16":                             // community owned
            case "17":                             // post migrated superseded with 35/36
            case "18":                             // question merged
            case "21":                             // post disassociated
            case "22":                             // question unmerged
            case "24":                             // suggested edit applied
            case "25":                             // post tweeted
            case "31":                             // comment discussion moved to chat
            case "33":                             // post notice added
            case "34":                             // post notice removed
            case "35":                             // post migrated away replaces 17
            case "36":                             // post migrated here replaces 17
            case "37":                             // post merge source
            case "38":                             // post merge destination
                isExportedEvent = false;
                break;

            default:
                unknownPostHistoryTypeIds.RegisterUnknownValue(r.Value);
                isExportedEvent = false;
                break;
            }

            if (!isExportedEvent)
            {
                return;
            }

            // Common tail of every exported event: date, participants, link to the post.
            if (r.MoveToAttribute("CreationDate"))
            {
                w.AddToTriple(uris.GeneralUris.DateProperty, DateTime.Parse(r.Value, System.Globalization.CultureInfo.InvariantCulture));
            }
            AddParticipants(uris, r, w);
            LinkToPost(uris, subjectUri, r, w);
        }
示例#9
0
        /// <summary>
        /// Converts one XML source file of a site into a Turtle file in the destination directory.
        /// </summary>
        /// <param name="generalUris">Provides the general URIs used for the generated triples.</param>
        /// <param name="srcFile">The path of the XML file to convert.</param>
        /// <param name="destDir">The directory that receives the converted file.</param>
        /// <param name="website">The site that the source file belongs to.</param>
        /// <remarks>
        /// The converter is chosen by the local name of the document's root element; an unknown root
        /// element results in a warning. If the document contains no element at all, an error is written
        /// and the method returns without updating the statistics or printing the completion message.
        /// </remarks>
        public static void Convert(GeneralUris generalUris, string srcFile, string destDir, SiteInfo website)
        {
            string displayName = Path.GetFileName(srcFile);

            Console.WriteLine("Processing {0} ...", displayName);

            var siteUris = new SiteUris(generalUris, website);
            var nsMapper = siteUris.CreateNamespaceMapper();

            // The output file is named "<site>-<source file name without extension>.ttl".
            string destFile = Path.Combine(destDir, website + "-" + Path.GetFileNameWithoutExtension(srcFile) + ".ttl");

            using (var destWriter = new SequentialTurtleWriter(File.CreateText(destFile), nsMapper))
            {
                using (var fs = File.OpenRead(srcFile))
                using (var reader = XmlReader.Create(fs))
                {
                    // Advance to the root element of the document.
                    while (reader.NodeType != XmlNodeType.Element)
                    {
                        if (reader.Read())
                        {
                            continue;
                        }

                        ConsoleHelper.WriteErrorLine("No contents found in file {0}.", srcFile);
                        return;
                    }

                    switch (reader.LocalName)
                    {
                    case "badges":
                        ConsoleHelper.WriteInfoLine("List of badges identified.");
                        ConvertBadges(siteUris, reader, destWriter);
                        break;

                    case "comments":
                        ConsoleHelper.WriteInfoLine("List of comments identified.");
                        ConvertComments(siteUris, reader, destWriter);
                        break;

                    case "posthistory":
                        ConsoleHelper.WriteInfoLine("List of posthistory identified.");
                        ConvertPostHistory(siteUris, reader, destWriter);
                        break;

                    case "postlinks":
                        ConsoleHelper.WriteInfoLine("List of postlinks identified.");
                        ConvertPostLinks(siteUris, reader, destWriter);
                        break;

                    case "posts":
                        ConsoleHelper.WriteInfoLine("List of posts identified.");
                        ConvertPosts(siteUris, reader, destWriter);
                        break;

                    case "tags":
                        ConsoleHelper.WriteInfoLine("List of tags identified.");
                        ConvertTags(siteUris, reader, destWriter);
                        break;

                    case "users":
                        ConsoleHelper.WriteInfoLine("List of users identified.");
                        ConvertUsers(siteUris, reader, destWriter);
                        break;

                    case "votes":
                        ConsoleHelper.WriteInfoLine("List of votes identified.");
                        ConvertVotes(siteUris, reader, destWriter);
                        break;

                    default:
                        ConsoleHelper.WriteWarningLine("Unknown root element \"{0}\". Skipping document.", reader.LocalName);
                        break;
                    }
                }

                // The statistics are taken after the input has been closed, but before the writer is disposed.
                GlobalData.UpdateStats(destWriter);
            }

            Console.WriteLine("Conversion of {0} completed.", displayName);
        }
        /// <summary>
        /// Converts a single post element into triples.
        /// </summary>
        /// <param name="uris">Provides the URIs used for the generated triples.</param>
        /// <param name="r">The reader whose current element describes the post.</param>
        /// <param name="w">The writer that receives the triples.</param>
        /// <param name="unknownPostTypeIds">Receives every <c>PostTypeId</c> value that is not known here.</param>
        /// <remarks>
        /// <para>Type-specific information is written first (questions: accepted answer, view count,
        ///   title, score; answers: link from the parent post, score), followed by the attributes that
        ///   are handled for every post type: dates, owner, body and tags.</para>
        /// <para>Elements lacking an <c>Id</c> or <c>PostTypeId</c> attribute are reported as errors.
        ///   Numbers and dates are parsed with the invariant culture, so the result does not depend on
        ///   the regional settings of the machine that runs the conversion.</para>
        /// </remarks>
        private static void ConvertPost(SiteUris uris, XmlReader r, SequentialTurtleWriter w, UnknownValueStore <string> unknownPostTypeIds)
        {
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            if (!r.MoveToAttribute("Id"))
            {
                // Step back from attribute level so that the whole element is shown in the message.
                r.MoveToElement();
                ConsoleHelper.WriteErrorLine("No Id attribute found on element {0}. Skipping element.", r.ReadOuterXml());
                return;
            }

            Uri subjectUri = uris.CreatePostUri(r.Value);
            w.StartTriple(subjectUri);

            if (!r.MoveToAttribute("PostTypeId"))
            {
                r.MoveToElement();
                ConsoleHelper.WriteErrorLine("No PostTypeId attribute found on element {0}. Skipping element.", r.ReadOuterXml());
                return;
            }

            switch (r.Value)
            {
            case "1":                             // question
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.QuestionType);
                uris.LinkToSite(w);
                if (r.MoveToAttribute("AcceptedAnswerId"))
                {
                    w.AddToTriple(uris.GeneralUris.AcceptedAnswerProperty, uris.CreatePostUri(r.Value));
                }
                if (r.MoveToAttribute("ViewCount"))
                {
                    w.AddToTriple(uris.GeneralUris.ViewCountProperty, long.Parse(r.Value, invariant));
                }
                if (r.MoveToAttribute("Title"))
                {
                    w.AddToTriple(uris.GeneralUris.TitleProperty, r.Value);
                    w.AddToTriple(uris.GeneralUris.LabelProperty, r.Value);
                }
                if (r.MoveToAttribute("Score"))
                {
                    w.AddToTriple(uris.GeneralUris.ScoreProperty, long.Parse(r.Value, invariant));
                }
                break;

            case "2":                             // answer
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.AnswerType);
                uris.LinkToSite(w);
                if (r.MoveToAttribute("ParentId"))
                {
                    // The link is stated with the parent post as subject; afterwards the answer is restored as subject.
                    w.StartTriple(uris.CreatePostUri(r.Value));
                    w.AddToTriple(uris.GeneralUris.AnswerProperty, subjectUri);
                    w.StartTriple(subjectUri);
                }
                else
                {
                    ConsoleHelper.WriteWarningLine("Orphaned answer: {0}", subjectUri);
                }
                if (r.MoveToAttribute("Score"))
                {
                    w.AddToTriple(uris.GeneralUris.ScoreProperty, long.Parse(r.Value, invariant));
                }
                break;

            case "3":                             // orphaned tag wiki
                break;

            case "4":                             // tag info excerpt
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.TagExcerptType);
                break;

            case "5":                             // tag description
                w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.TagDescriptionType);
                break;

            case "6":                             // moderator nomination
                break;

            case "7":                             // "Wiki placeholder" (seems to only be the election description)
                break;

            default:
                unknownPostTypeIds.RegisterUnknownValue(r.Value);
                break;
            }

            if (r.MoveToAttribute("CreationDate"))
            {
                w.AddToTriple(uris.GeneralUris.DateProperty, DateTime.Parse(r.Value, invariant));
            }
            if (r.MoveToAttribute("LastEditDate"))
            {
                w.AddToTriple(uris.GeneralUris.LastEditDateProperty, DateTime.Parse(r.Value, invariant));
            }
            // The data dump names this attribute "LastActivityDate"; "LastActivity" never matches.
            if (r.MoveToAttribute("LastActivityDate"))
            {
                w.AddToTriple(uris.GeneralUris.LastActivityDateProperty, DateTime.Parse(r.Value, invariant));
            }
            if (r.MoveToAttribute("OwnerUserId"))
            {
                w.AddToTriple(uris.GeneralUris.OwnerProperty, uris.CreateUserUri(r.Value));
            }
            // TODO: LastEditorUserId (given in post history)
            // TODO: FavoriteCount (linked to users?)
            if (r.MoveToAttribute("Body"))
            {
                w.AddToTriple(uris.GeneralUris.DescriptionProperty, r.Value);
            }
            if (r.MoveToAttribute("Tags"))
            {
                // Each match of tagRegex yields one tag name in its first capture group.
                w.AddToTriple(uris.GeneralUris.TagProperty,
                              tagRegex.Matches(r.Value).Cast <Match>().Select(m => uris.CreateTagUri(m.Groups[1].Value)));
            }
        }
示例#11
0
        /// <summary>
        /// Downloads and converts the data.
        /// </summary>
        /// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
        /// <param name="filelist">The URL of a filelist Xml file.</param>
        /// <exception cref="ArgumentNullException">Any of the arguments is <see langword="null"/>.</exception>
        /// <remarks>
        /// Working files are placed in the <c>tmp</c> directory and results in the <c>rdf</c> directory,
        /// both located below <c>BaseDir</c>. Unless the <c>KeepTemporaryFiles</c> option is set, the
        /// temporary directory is removed at the end; a failure to do so is only reported.
        /// </remarks>
        private static void RetrieveData(GeneralUris generalUris, Uri filelist)
        {
            if (generalUris == null)
            {
                throw new ArgumentNullException("generalUris");
            }
            if (filelist == null)
            {
                throw new ArgumentNullException("filelist");
            }

            string tempDir = Path.Combine(BaseDir, "tmp");
            string destDir = Path.Combine(BaseDir, "rdf");

            Directory.CreateDirectory(tempDir);
            Directory.CreateDirectory(destDir);

            DateTime startTime = DateTime.Now;
            ConsoleHelper.WriteInfoLine("Current time: {0:yyyy-MM-dd HH:mm:ss}", startTime);

            if (!GlobalData.Options.OntologyOnly)
            {
                string filesListPath = Path.Combine(tempDir, "files.xml");

                ConsoleHelper.WriteMilestone("Downloading files list ...");
                using (var client = new WebClient())
                {
                    client.DownloadFile(filelist, filesListPath);
                }
                Console.WriteLine(" done.");

                var    files               = LoadFilesList(filesListPath).OrderBy(f => f).ToArray();
                double compressedGigabytes = (double)files.Sum(f => f.Size) / 1024 / 1024 / 1024;
                ConsoleHelper.WriteInfoLine("{0} file(s) in list, totalling to a compressed size of {1:F1} GB.",
                                            files.Length, compressedGigabytes);

                int processedFilesCount = 0;
                foreach (var file in files)
                {
                    SiteInfo siteInfo;
                    try
                    {
                        siteInfo = file.RetrieveSiteInfo();
                    }
                    catch (ArgumentException ex)
                    {
                        ConsoleHelper.WriteErrorLine("Skipping file {0}, as it cannot be associated with a website.\n{1}", file, ex);
                        continue;
                    }

                    // Files of excluded sites neither get registered nor count towards the file limit.
                    if (!IncludeSite(siteInfo))
                    {
                        continue;
                    }

                    GlobalData.Sites[siteInfo.Id] = siteInfo;

                    if (!GlobalData.Options.SiteListOnly)
                    {
                        string fn = file.Download(filelist, tempDir);

                        string[] rawFiles;
                        if (Path.GetExtension(fn) == ".7z")
                        {
                            rawFiles = ExtractSevenZipArchive(fn);
                        }
                        else
                        {
                            ConsoleHelper.WriteWarningLine("File {0} has an unknown file extension.", fn);
                            rawFiles = null;
                        }

                        if (rawFiles != null)
                        {
                            ConsoleHelper.WriteInfoLine("{0} file(s) extracted.", rawFiles.Length);

                            foreach (var rawFile in rawFiles)
                            {
                                Converter.Convert(generalUris, rawFile, destDir, siteInfo);
                            }
                        }
                    }

                    processedFilesCount++;
                    if (processedFilesCount >= GlobalData.Options.MaxFileCount)
                    {
                        break;
                    }
                }
            }

            GlobalInformationConverter.Convert(generalUris, tempDir, filelist, destDir);

            if (!GlobalData.Options.KeepTemporaryFiles)
            {
                Console.Write("Removing temporary files ... ");
                try
                {
                    Directory.Delete(tempDir, true);
                    Console.WriteLine(" done.");
                }
                catch
                {
                    // Best effort: a failed clean-up must not abort the run.
                    ConsoleHelper.WriteErrorLine("Please remove the directory {0} manually.", tempDir);
                }
            }

            Console.WriteLine();
            GlobalData.PrintStats();

            DateTime endTime = DateTime.Now;

            ConsoleHelper.WriteInfoLine("Current time: {0:yyyy-MM-dd HH:mm:ss}", endTime);
            ConsoleHelper.WriteInfoLine("Total duration: {0}", endTime - startTime);
        }