/// <summary>
/// Walks the nodes below the current element and converts every <c>row</c> element into triples.
/// </summary>
/// <param name="uris">The URIs of the site the posts belong to.</param>
/// <param name="r">The reader to consume the rows from.</param>
/// <param name="w">The writer receiving the generated triples.</param>
/// <remarks>
/// As soon as an end element is encountered, a summary of the unknown PostTypeId values
/// collected so far is printed (if there are any) and the method returns.
/// </remarks>
private static void ConvertPosts(SiteUris uris, XmlReader r, SequentialTurtleWriter w)
{
    var unrecognizedTypeIds = new UnknownValueStore<string>();

    while (r.Read())
    {
        if (r.NodeType == XmlNodeType.EndElement)
        {
            // End of the list: report everything that could not be classified, then stop.
            long unrecognizedCount = unrecognizedTypeIds.RegisteredValueCount;
            if (unrecognizedCount > 0)
            {
                ConsoleHelper.WriteWarningLine("{0} unknown PostTypeId value(s) found: {1}", unrecognizedCount, unrecognizedTypeIds);
            }
            return;
        }

        if (r.NodeType != XmlNodeType.Element || r.LocalName != "row")
        {
            continue;
        }

        // Each row is handed over through its own subtree reader.
        using (XmlReader rowReader = r.ReadSubtree())
        {
            rowReader.Read();
            ConvertPost(uris, rowReader, w, unrecognizedTypeIds);
        }
    }
}
/// <summary>
/// Writes the triples describing a single site element.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="r">The reader positioned on the site element.</param>
/// <param name="w">The writer receiving the generated triples.</param>
/// <returns>
/// <see langword="true"/> if the site was written; <see langword="false"/> if the element has no
/// <c>Address</c> attribute or the address is not contained in <c>GlobalData.Sites</c>.
/// </returns>
private static bool ConvertSite(GeneralUris generalUris, XmlReader r, SequentialTurtleWriter w)
{
    if (!r.MoveToAttribute("Address"))
    {
        r.MoveToElement();
        ConsoleHelper.WriteErrorLine("No Address attribute found on element {0}. Skipping element.", r.ReadOuterXml());
        return false;
    }

    string siteAddress = r.Value;
    SiteInfo site;
    if (!GlobalData.Sites.TryGetValue(siteAddress, out site))
    {
        // Addresses that are not registered are skipped without a message.
        return false;
    }

    Uri siteUri = generalUris.CreateSiteUri(site);
    w.StartTriple(siteUri);
    w.AddToTriple(generalUris.IsMetaSiteProperty, site.IsMetaSite);
    w.AddToTriple(generalUris.LanguageProperty, site.IsEnglishSite ? "en" : site.Language);
    w.AddToTriple(generalUris.TypeProperty, generalUris.SiteInfoType);
    w.AddToTriple(generalUris.WebsiteProperty, new Uri("http://" + siteAddress));

    if (r.MoveToAttribute("Name"))
    {
        // The name serves both as label and as title.
        w.AddToTriple(generalUris.LabelProperty, r.Value);
        w.AddToTriple(generalUris.TitleProperty, r.Value);
    }

    if (r.MoveToAttribute("Description"))
    {
        w.AddToTriple(generalUris.DescriptionProperty, r.Value);
    }

    if (r.MoveToAttribute("ParentAddress"))
    {
        SiteInfo parentSite;
        if (GlobalData.Sites.TryGetValue(r.Value, out parentSite))
        {
            w.AddToTriple(generalUris.ParentSiteProperty, generalUris.CreateSiteUri(parentSite));
        }
        else
        {
            ConsoleHelper.WriteWarningLine("Unknown parent site {0}; skipping information.", r.Value);
        }
    }

    return true;
}
/// <summary>
/// Attaches the given subject as an event to the post named by the <c>PostId</c> attribute.
/// </summary>
/// <param name="uris">The URIs of the site the post belongs to.</param>
/// <param name="subjectUri">The URI of the item that is linked to the post.</param>
/// <param name="r">The reader positioned within the item's element.</param>
/// <param name="w">The writer receiving the generated triples.</param>
/// <remarks>
/// After the link has been written, the writer is switched back to <paramref name="subjectUri"/>.
/// If there is no <c>PostId</c> attribute, only a warning is printed.
/// </remarks>
private static void LinkToPost(SiteUris uris, Uri subjectUri, XmlReader r, SequentialTurtleWriter w)
{
    if (!r.MoveToAttribute("PostId"))
    {
        ConsoleHelper.WriteWarningLine("Orphaned post history item: {0}", subjectUri.AbsoluteUri);
        return;
    }

    var postUri = uris.CreatePostUri(r.Value);
    w.StartTriple(postUri);
    w.AddToTriple(uris.GeneralUris.EventProperty, subjectUri);

    // Continue with the original subject so that callers can keep adding to it.
    w.StartTriple(subjectUri);
}
/// <summary>
/// Walks the nodes below the current element and converts every <c>row</c> element into triples.
/// </summary>
/// <param name="uris">The URIs of the site the users belong to.</param>
/// <param name="r">The reader to consume the rows from.</param>
/// <param name="w">The writer receiving the generated triples.</param>
/// <remarks>
/// As soon as an end element is encountered, a warning summarizing the malformed URLs is
/// printed (if the total count is greater than zero) and the method returns.
/// </remarks>
private static void ConvertUsers(SiteUris uris, XmlReader r, SequentialTurtleWriter w)
{
    var malformedIriSamples = new List<string>();
    long malformedIriTotal = 0;

    while (r.Read())
    {
        if (r.NodeType == XmlNodeType.Element)
        {
            if (r.LocalName == "row")
            {
                // Each row is handed over through its own subtree reader.
                using (XmlReader rowReader = r.ReadSubtree())
                {
                    rowReader.Read();
                    ConvertUser(uris, rowReader, w, malformedIriSamples, ref malformedIriTotal);
                }
            }
        }
        else if (r.NodeType == XmlNodeType.EndElement)
        {
            if (malformedIriTotal > 0)
            {
                // The collected samples are listed in parentheses, separated by semicolons.
                string samples = malformedIriSamples.Count > 0
                    ? " (e.g. " + string.Join("; ", malformedIriSamples.ToArray()) + ")"
                    : "";
                ConsoleHelper.WriteWarningLine("{1} malformed URL(s) found{0}, treated as string literals.", samples, malformedIriTotal);
            }
            return;
        }
    }
}
/// <summary>
/// Downloads the file to the local file system.
/// </summary>
/// <param name="baseUri">The base URI that the relative file name can be expanded from.</param>
/// <param name="directory">The destination directory.</param>
/// <returns>The path to the downloaded file.</returns>
/// <exception cref="ArgumentNullException">Any of the arguments is <see langword="null"/>.</exception>
/// <remarks>
/// <para>This method downloads the file to the local file system.
/// After downloading, the size and the MD5 hash of the file contents are compared with the expected values.
/// A mismatch is reported as a warning only; the path is returned in either case.</para>
/// <para>The MD5 hash is compared case-insensitively, as the case of hexadecimal digits carries no meaning.</para>
/// </remarks>
public string Download(Uri baseUri, string directory)
{
    if (baseUri == null)
    {
        throw new ArgumentNullException("baseUri");
    }
    if (directory == null)
    {
        // Fail with the documented exception and the proper parameter name
        // instead of relying on Path.Combine to reject the value.
        throw new ArgumentNullException("directory");
    }

    string fn = Path.Combine(directory, name);

    ConsoleHelper.WriteMilestone("Downloading {0} ...", name);
    using (var client = new WebClient())
    {
        client.DownloadFile(new Uri(baseUri, name), fn);
    }
    System.Console.WriteLine(" done.");

    var fInfo = new FileInfo(fn);

    // Size check.
    long fileSize = fInfo.Length;
    if (fileSize == this.size)
    {
        ConsoleHelper.WriteSuccessLine("File size of {0} bytes verified.", fileSize);
    }
    else
    {
        ConsoleHelper.WriteWarningLine("File size is {0} bytes, which differs from the expected size of {1} bytes.", fileSize, this.size);
    }

    // Hash check.
    byte[] hash;
    using (var algo = MD5.Create())
    using (var fs = fInfo.OpenRead())
    {
        hash = algo.ComputeHash(fs);
    }

    // Lower-case hexadecimal representation, two digits per byte.
    string fileMD5 = string.Join("", hash.Select(b => b.ToString("x2", System.Globalization.CultureInfo.InvariantCulture)));
    if (string.Equals(fileMD5, this.md5, StringComparison.OrdinalIgnoreCase))
    {
        ConsoleHelper.WriteSuccessLine("MD5 hash ({0}) verified.", fileMD5);
    }
    else
    {
        ConsoleHelper.WriteWarningLine("MD5 hash of file ({0}) does not match the expected one ({1}).", fileMD5, this.md5);
    }

    return fn;
}
/// <summary>
/// Writes the triples describing a single comment element.
/// </summary>
/// <param name="uris">The URIs of the site the comment belongs to.</param>
/// <param name="r">The reader positioned on the comment's element.</param>
/// <param name="w">The writer receiving the generated triples.</param>
/// <remarks>
/// Elements without an <c>Id</c> attribute are reported as an error and skipped.
/// Numeric and date values are parsed with the invariant culture, so that the result
/// does not depend on the regional settings of the machine running the conversion.
/// </remarks>
private static void ConvertComment(SiteUris uris, XmlReader r, SequentialTurtleWriter w)
{
    if (!r.MoveToAttribute("Id"))
    {
        r.MoveToElement();
        ConsoleHelper.WriteErrorLine("No Id attribute found on element {0}. Skipping element.", r.ReadOuterXml());
        return;
    }

    Uri subjectUri = uris.CreateCommentUri(r.Value);
    w.StartTriple(subjectUri);

    w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.CommentType);
    uris.LinkToSite(w);

    if (r.MoveToAttribute("Score"))
    {
        w.AddToTriple(uris.GeneralUris.ScoreProperty, long.Parse(r.Value, System.Globalization.CultureInfo.InvariantCulture));
    }
    if (r.MoveToAttribute("Text"))
    {
        w.AddToTriple(uris.GeneralUris.DescriptionProperty, r.Value);
    }
    if (r.MoveToAttribute("CreationDate"))
    {
        w.AddToTriple(uris.GeneralUris.DateProperty, DateTime.Parse(r.Value, System.Globalization.CultureInfo.InvariantCulture));
    }
    if (r.MoveToAttribute("UserId"))
    {
        w.AddToTriple(uris.GeneralUris.OwnerProperty, uris.CreateUserUri(r.Value));
    }

    if (r.MoveToAttribute("PostId"))
    {
        // The link is stated on the post; afterwards, the writer is switched back to the comment.
        w.StartTriple(uris.CreatePostUri(r.Value));
        w.AddToTriple(uris.GeneralUris.CommentProperty, subjectUri);
        w.StartTriple(subjectUri);
    }
    else
    {
        ConsoleHelper.WriteWarningLine("Orphaned comment: {0}", subjectUri);
    }
}
/// <summary>
/// Downloads <c>Sites.xml</c> and converts it to the file <c>_sites.ttl</c>.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="tempDir">The directory the downloaded file is stored in.</param>
/// <param name="baseUri">The base URI that <c>Sites.xml</c> is resolved against.</param>
/// <param name="destDir">The directory the Turtle file is written to.</param>
/// <param name="nsMapper">The namespace mapper passed to the Turtle writer.</param>
/// <remarks>
/// If the downloaded document does not contain any element, an error is printed and the
/// method returns right away, without updating the statistics.
/// </remarks>
private static void ConvertSiteList(GeneralUris generalUris, string tempDir, Uri baseUri, string destDir, VDS.RDF.INamespaceMapper nsMapper)
{
    string srcFile = Path.Combine(tempDir, "Sites.xml");

    ConsoleHelper.WriteMilestone("Downloading site list ...");
    using (var webClient = new WebClient())
    {
        webClient.DownloadFile(new Uri(baseUri, "Sites.xml"), srcFile);
    }
    Console.WriteLine(" done.");

    string turtleFile = Path.Combine(destDir, "_sites.ttl");
    using (var turtleWriter = new SequentialTurtleWriter(File.CreateText(turtleFile), nsMapper))
    {
        using (var input = File.OpenRead(srcFile))
        using (var xml = XmlReader.Create(input))
        {
            // Advance to the root element.
            while (xml.NodeType != XmlNodeType.Element)
            {
                if (!xml.Read())
                {
                    ConsoleHelper.WriteErrorLine("No contents found in file {0}.", srcFile);
                    return;
                }
            }

            if (xml.LocalName != "sitelist")
            {
                ConsoleHelper.WriteWarningLine("Unknown root element \"{0}\". Skipping document.", xml.LocalName);
            }
            else
            {
                ConvertSites(generalUris, xml, turtleWriter);
            }
        }

        GlobalData.UpdateStats(turtleWriter);
    }

    Console.WriteLine("Conversion of site list completed.");
}
/// <summary>
/// Writes the triples describing a single post history element.
/// </summary>
/// <param name="uris">The URIs of the site the post history item belongs to.</param>
/// <param name="r">The reader positioned on the item's element.</param>
/// <param name="w">The writer receiving the generated triples.</param>
/// <param name="unknownPostHistoryTypeIds">A store that unrecognized PostHistoryTypeId values are registered in.</param>
/// <remarks>
/// Only closing, reopening, deletion, undeletion, locking, unlocking, protection and unprotection
/// events (type IDs 10 to 15, 19 and 20) produce any further output. All other known type IDs
/// are ignored, and unknown ones are registered in <paramref name="unknownPostHistoryTypeIds"/>.
/// </remarks>
private static void ConvertPostHistoryItem(SiteUris uris, XmlReader r, SequentialTurtleWriter w, UnknownValueStore<string> unknownPostHistoryTypeIds)
{
    if (!r.MoveToAttribute("Id"))
    {
        r.MoveToElement();
        ConsoleHelper.WriteErrorLine("No Id attribute found on element {0}. Skipping element.", r.ReadOuterXml());
        return;
    }

    Uri subjectUri = uris.CreatePostHistoryUri(r.Value);
    w.StartTriple(subjectUri);

    if (!r.MoveToAttribute("PostHistoryTypeId"))
    {
        r.MoveToElement();
        ConsoleHelper.WriteErrorLine("No PostHistoryTypeId attribute found on element {0}. Skipping element.", r.ReadOuterXml());
        return;
    }

    string historyTypeId = r.Value;
    bool isExportedEvent = true;

    // Step 1: state the type of the event, or find out that the item is not exported at all.
    switch (historyTypeId)
    {
        case "10": // post closed
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostClosureType);
            break;
        case "11": // post reopened
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostReopeningType);
            break;
        case "12": // post deleted
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostDeletionType);
            break;
        case "13": // post undeleted
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostUndeletionType);
            break;
        case "14": // post locked
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostLockingType);
            break;
        case "15": // post unlocked
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostUnlockingType);
            break;
        case "19": // question protected
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostProtectionType);
            break;
        case "20": // question unprotected
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.PostUnprotectionType);
            break;

        case "1":  // initial title
        case "2":  // initial body
        case "3":  // initial tags
        case "4":  // edit title
        case "5":  // edit body
        case "6":  // edit tags
        case "7":  // rollback title
        case "8":  // rollback body
        case "9":  // rollback tags
        case "16": // community owned
        case "17": // post migrated (superseded with 35/36)
        case "18": // question merged
        case "21": // post disassociated
        case "22": // question unmerged
        case "24": // suggested edit applied
        case "25": // post tweeted
        case "31": // comment discussion moved to chat
        case "33": // post notice added
        case "34": // post notice removed
        case "35": // post migrated away (replaces 17)
        case "36": // post migrated here (replaces 17)
        case "37": // post merge source
        case "38": // post merge destination
            isExportedEvent = false;
            break;

        default:
            unknownPostHistoryTypeIds.RegisterUnknownValue(historyTypeId);
            isExportedEvent = false;
            break;
    }

    if (!isExportedEvent)
    {
        return;
    }

    // Step 2: closures additionally carry a close reason in the Comment attribute.
    if (historyTypeId == "10" && r.MoveToAttribute("Comment"))
    {
        switch (r.Value)
        {
            case "1":   // Exact Duplicate
            case "101": // duplicate
                w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.DuplicateCloseReason);
                break;
            case "2":   // Off-topic
            case "102": // Off-topic
                w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.OffTopicCloseReason);
                break;
            case "3":   // Subjective and argumentative
            case "105": // Primarily opinion-based
                w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.SubjectiveCloseReason);
                break;
            case "4":   // Not a real question
                w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.NotAQuestionCloseReason);
                break;
            case "7":   // Too localized
                w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.TooLocalizedCloseReason);
                break;
            case "10":  // General reference
                w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.GeneralReferenceCloseReason);
                break;
            case "20":  // Noise or pointless
                w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.NoiseCloseReason);
                break;
            case "103": // Unclear what you're asking
                w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.UnclearCloseReason);
                break;
            case "104": // Too broad
                w.AddToTriple(uris.GeneralUris.CloseReasonProperty, uris.GeneralUris.TooBroadCloseReason);
                break;
            default:
                ConsoleHelper.WriteWarningLine("Unknown post close reason: {0}", r.Value);
                break;
        }
    }

    // Step 3: the remaining information is the same for all exported event types.
    if (r.MoveToAttribute("CreationDate"))
    {
        w.AddToTriple(uris.GeneralUris.DateProperty, DateTime.Parse(r.Value, System.Globalization.CultureInfo.InvariantCulture));
    }
    AddParticipants(uris, r, w);
    LinkToPost(uris, subjectUri, r, w);
}
/// <summary>
/// Converts a single Xml file of a site to a Turtle file in the destination directory.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="srcFile">The path to the Xml file to convert.</param>
/// <param name="destDir">The directory the Turtle file is written to.</param>
/// <param name="website">The site the file belongs to.</param>
/// <remarks>
/// The converter is chosen based on the local name of the root element. Documents with an
/// unknown root element are skipped with a warning. If the document does not contain any
/// element, an error is printed and the method returns right away, without updating the statistics.
/// </remarks>
public static void Convert(GeneralUris generalUris, string srcFile, string destDir, SiteInfo website)
{
    string fileNameOnly = Path.GetFileName(srcFile);
    Console.WriteLine("Processing {0} ...", fileNameOnly);

    var siteUris = new SiteUris(generalUris, website);
    var nsMapper = siteUris.CreateNamespaceMapper();

    // The destination file is named after the site and the source file.
    string turtleFile = Path.Combine(destDir, website + "-" + Path.GetFileNameWithoutExtension(srcFile) + ".ttl");
    using (var turtleWriter = new SequentialTurtleWriter(File.CreateText(turtleFile), nsMapper))
    {
        using (var input = File.OpenRead(srcFile))
        using (var xml = XmlReader.Create(input))
        {
            // Advance to the root element.
            while (xml.NodeType != XmlNodeType.Element)
            {
                if (!xml.Read())
                {
                    ConsoleHelper.WriteErrorLine("No contents found in file {0}.", srcFile);
                    return;
                }
            }

            switch (xml.LocalName)
            {
                case "badges":
                    ConsoleHelper.WriteInfoLine("List of badges identified.");
                    ConvertBadges(siteUris, xml, turtleWriter);
                    break;
                case "comments":
                    ConsoleHelper.WriteInfoLine("List of comments identified.");
                    ConvertComments(siteUris, xml, turtleWriter);
                    break;
                case "posthistory":
                    ConsoleHelper.WriteInfoLine("List of posthistory identified.");
                    ConvertPostHistory(siteUris, xml, turtleWriter);
                    break;
                case "postlinks":
                    ConsoleHelper.WriteInfoLine("List of postlinks identified.");
                    ConvertPostLinks(siteUris, xml, turtleWriter);
                    break;
                case "posts":
                    ConsoleHelper.WriteInfoLine("List of posts identified.");
                    ConvertPosts(siteUris, xml, turtleWriter);
                    break;
                case "tags":
                    ConsoleHelper.WriteInfoLine("List of tags identified.");
                    ConvertTags(siteUris, xml, turtleWriter);
                    break;
                case "users":
                    ConsoleHelper.WriteInfoLine("List of users identified.");
                    ConvertUsers(siteUris, xml, turtleWriter);
                    break;
                case "votes":
                    ConsoleHelper.WriteInfoLine("List of votes identified.");
                    ConvertVotes(siteUris, xml, turtleWriter);
                    break;
                default:
                    ConsoleHelper.WriteWarningLine("Unknown root element \"{0}\". Skipping document.", xml.LocalName);
                    break;
            }
        }

        GlobalData.UpdateStats(turtleWriter);
    }

    Console.WriteLine("Conversion of {0} completed.", fileNameOnly);
}
/// <summary>
/// Writes the triples describing a single post element.
/// </summary>
/// <param name="uris">The URIs of the site the post belongs to.</param>
/// <param name="r">The reader positioned on the post's element.</param>
/// <param name="w">The writer receiving the generated triples.</param>
/// <param name="unknownPostTypeIds">A store that unrecognized PostTypeId values are registered in.</param>
/// <remarks>
/// <para>Elements without an <c>Id</c> or a <c>PostTypeId</c> attribute are reported as an error and skipped.</para>
/// <para>The type-specific information is written first, followed by the information that is
/// exported for every post regardless of its type (dates, owner, body, tags).</para>
/// <para>Numeric and date values are parsed with the invariant culture, so that the result
/// does not depend on the regional settings of the machine running the conversion.</para>
/// </remarks>
private static void ConvertPost(SiteUris uris, XmlReader r, SequentialTurtleWriter w, UnknownValueStore<string> unknownPostTypeIds)
{
    if (!r.MoveToAttribute("Id"))
    {
        r.MoveToElement();
        ConsoleHelper.WriteErrorLine("No Id attribute found on element {0}. Skipping element.", r.ReadOuterXml());
        return;
    }

    Uri subjectUri = uris.CreatePostUri(r.Value);
    w.StartTriple(subjectUri);

    if (!r.MoveToAttribute("PostTypeId"))
    {
        r.MoveToElement();
        ConsoleHelper.WriteErrorLine("No PostTypeId attribute found on element {0}. Skipping element.", r.ReadOuterXml());
        return;
    }

    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    switch (r.Value)
    {
        case "1": // question
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.QuestionType);
            uris.LinkToSite(w);
            if (r.MoveToAttribute("AcceptedAnswerId"))
            {
                w.AddToTriple(uris.GeneralUris.AcceptedAnswerProperty, uris.CreatePostUri(r.Value));
            }
            if (r.MoveToAttribute("ViewCount"))
            {
                w.AddToTriple(uris.GeneralUris.ViewCountProperty, long.Parse(r.Value, invariant));
            }
            if (r.MoveToAttribute("Title"))
            {
                // The title serves both as title and as label.
                w.AddToTriple(uris.GeneralUris.TitleProperty, r.Value);
                w.AddToTriple(uris.GeneralUris.LabelProperty, r.Value);
            }
            if (r.MoveToAttribute("Score"))
            {
                w.AddToTriple(uris.GeneralUris.ScoreProperty, long.Parse(r.Value, invariant));
            }
            break;

        case "2": // answer
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.AnswerType);
            uris.LinkToSite(w);
            if (r.MoveToAttribute("ParentId"))
            {
                // The link is stated on the question; afterwards, the writer is switched back to the answer.
                w.StartTriple(uris.CreatePostUri(r.Value));
                w.AddToTriple(uris.GeneralUris.AnswerProperty, subjectUri);
                w.StartTriple(subjectUri);
            }
            else
            {
                ConsoleHelper.WriteWarningLine("Orphaned answer: {0}", subjectUri);
            }
            if (r.MoveToAttribute("Score"))
            {
                w.AddToTriple(uris.GeneralUris.ScoreProperty, long.Parse(r.Value, invariant));
            }
            break;

        case "3": // orphaned tag wiki
            break;

        case "4": // tag info excerpt
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.TagExcerptType);
            break;

        case "5": // tag description
            w.AddToTriple(uris.GeneralUris.TypeProperty, uris.GeneralUris.TagDescriptionType);
            break;

        case "6": // moderator nomination
            break;

        case "7": // "Wiki placeholder" (seems to only be the election description)
            // No type is stated for these posts.
            break;

        default:
            unknownPostTypeIds.RegisterUnknownValue(r.Value);
            break;
    }

    if (r.MoveToAttribute("CreationDate"))
    {
        w.AddToTriple(uris.GeneralUris.DateProperty, DateTime.Parse(r.Value, invariant));
    }
    if (r.MoveToAttribute("LastEditDate"))
    {
        w.AddToTriple(uris.GeneralUris.LastEditDateProperty, DateTime.Parse(r.Value, invariant));
    }

    // Stack Exchange dumps name this attribute "LastActivityDate"; the shorter
    // "LastActivity" is still accepted as a fallback.
    if (r.MoveToAttribute("LastActivityDate") || r.MoveToAttribute("LastActivity"))
    {
        w.AddToTriple(uris.GeneralUris.LastActivityDateProperty, DateTime.Parse(r.Value, invariant));
    }

    if (r.MoveToAttribute("OwnerUserId"))
    {
        w.AddToTriple(uris.GeneralUris.OwnerProperty, uris.CreateUserUri(r.Value));
    }

    // TODO: LastEditorUserId (given in post history)
    // TODO: FavoriteCount (linked to users?)

    if (r.MoveToAttribute("Body"))
    {
        w.AddToTriple(uris.GeneralUris.DescriptionProperty, r.Value);
    }
    if (r.MoveToAttribute("Tags"))
    {
        // The first capture group of each match is used as the tag name.
        w.AddToTriple(uris.GeneralUris.TagProperty, tagRegex.Matches(r.Value).Cast<Match>().Select(m => uris.CreateTagUri(m.Groups[1].Value)));
    }
}
/// <summary>
/// Downloads and converts the data.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="filelist">The URL of a filelist Xml file.</param>
/// <exception cref="ArgumentNullException">Any of the arguments is <see langword="null"/>.</exception>
/// <remarks>
/// <para>Temporary files are stored in the <c>tmp</c> subdirectory and the results in the <c>rdf</c>
/// subdirectory of the base directory. Unless the options say otherwise, the temporary
/// directory is removed at the end; a failure to do so is reported, but does not abort the method.</para>
/// <para>The total duration is measured with a stopwatch rather than by subtracting local times,
/// so that daylight saving time transitions or clock adjustments do not distort the result.</para>
/// </remarks>
private static void RetrieveData(GeneralUris generalUris, Uri filelist)
{
    if (generalUris == null)
    {
        throw new ArgumentNullException("generalUris");
    }
    if (filelist == null)
    {
        throw new ArgumentNullException("filelist");
    }

    string tempDir = Path.Combine(BaseDir, "tmp");
    string destDir = Path.Combine(BaseDir, "rdf");
    Directory.CreateDirectory(tempDir);
    Directory.CreateDirectory(destDir);

    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
    ConsoleHelper.WriteInfoLine("Current time: {0:yyyy-MM-dd HH:mm:ss}", DateTime.Now);

    if (!GlobalData.Options.OntologyOnly)
    {
        string filesListPath = Path.Combine(tempDir, "files.xml");

        ConsoleHelper.WriteMilestone("Downloading files list ...");
        using (var client = new WebClient())
        {
            client.DownloadFile(filelist, filesListPath);
        }
        Console.WriteLine(" done.");

        var files = LoadFilesList(filesListPath).OrderBy(f => f).ToArray();
        ConsoleHelper.WriteInfoLine("{0} file(s) in list, totalling to a compressed size of {1:F1} GB.", files.Length, (double)files.Sum(f => f.Size) / 1024 / 1024 / 1024);

        int processedFilesCount = 0;
        foreach (var f in files)
        {
            SiteInfo siteInfo;
            try
            {
                siteInfo = f.RetrieveSiteInfo();
            }
            catch (ArgumentException ex)
            {
                ConsoleHelper.WriteErrorLine("Skipping file {0}, as it cannot be associated with a website.\n{1}", f, ex);
                continue;
            }

            if (!IncludeSite(siteInfo))
            {
                continue;
            }

            GlobalData.Sites[siteInfo.Id] = siteInfo;

            if (!GlobalData.Options.SiteListOnly)
            {
                string fn = f.Download(filelist, tempDir);

                string[] rawFiles = null;
                switch (Path.GetExtension(fn))
                {
                    case ".7z":
                        rawFiles = ExtractSevenZipArchive(fn);
                        break;
                    default:
                        ConsoleHelper.WriteWarningLine("File {0} has an unknown file extension.", fn);
                        break;
                }

                if (rawFiles != null)
                {
                    ConsoleHelper.WriteInfoLine("{0} file(s) extracted.", rawFiles.Length);
                    foreach (var rawFile in rawFiles)
                    {
                        Converter.Convert(generalUris, rawFile, destDir, siteInfo);
                    }
                }
            }

            // Only files of included sites count towards the limit.
            processedFilesCount++;
            if (processedFilesCount >= GlobalData.Options.MaxFileCount)
            {
                break;
            }
        }
    }

    GlobalInformationConverter.Convert(generalUris, tempDir, filelist, destDir);

    if (!GlobalData.Options.KeepTemporaryFiles)
    {
        Console.Write("Removing temporary files ... ");
        try
        {
            Directory.Delete(tempDir, true);
            Console.WriteLine(" done.");
        }
        catch (Exception ex)
        {
            // Cleaning up is best effort only, but the user should learn why it failed.
            ConsoleHelper.WriteErrorLine("Removing temporary files failed: {0}", ex.Message);
            ConsoleHelper.WriteErrorLine("Please remove the directory {0} manually.", tempDir);
        }
    }

    Console.WriteLine();
    GlobalData.PrintStats();

    stopwatch.Stop();
    ConsoleHelper.WriteInfoLine("Current time: {0:yyyy-MM-dd HH:mm:ss}", DateTime.Now);
    ConsoleHelper.WriteInfoLine("Total duration: {0}", stopwatch.Elapsed);
}