/// <summary>
/// Writes the dataset-wide output files (ontology, site list, account list, badge lists and constants)
/// into <paramref name="destDir"/>.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="tempDir">The directory the site list is downloaded into.</param>
/// <param name="baseUri">The URI relative to which the site list is located.</param>
/// <param name="destDir">The directory the output files are created in.</param>
/// <exception cref="ArgumentNullException">Any of the arguments is <see langword="null"/>.</exception>
/// <remarks>
/// The ontology is always written. The site list is skipped when the <c>OntologyOnly</c> option is set,
/// and the account list, badge lists and constants are additionally skipped when <c>SiteListOnly</c> is set.
/// </remarks>
public static void Convert(GeneralUris generalUris, string tempDir, Uri baseUri, string destDir)
{
    // generalUris is dereferenced right below; validate it like the other arguments
    // instead of failing with a NullReferenceException.
    if (generalUris == null)
    {
        throw new ArgumentNullException("generalUris");
    }
    if (tempDir == null)
    {
        throw new ArgumentNullException("tempDir");
    }
    if (baseUri == null)
    {
        throw new ArgumentNullException("baseUri");
    }
    if (destDir == null)
    {
        throw new ArgumentNullException("destDir");
    }

    var nsMapper = generalUris.CreateNamespaceMapper();

    WriteOntology(generalUris, destDir, nsMapper);

    if (!GlobalData.Options.OntologyOnly)
    {
        ConvertSiteList(generalUris, tempDir, baseUri, destDir, nsMapper);

        if (!GlobalData.Options.SiteListOnly)
        {
            WriteAccountList(generalUris, destDir, nsMapper);
            WriteBadgesLists(generalUris, destDir, nsMapper);
            WriteConstants(generalUris, destDir, nsMapper);
        }
    }
}
/// <summary>
/// Converts every <c>row</c> element below the reader's current position until the first end element
/// (or the end of the input) is reached, and reports how many rows were skipped.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="r">The reader positioned inside the list of sites.</param>
/// <param name="w">The writer that receives the converted sites.</param>
private static void ConvertSites(GeneralUris generalUris, XmlReader r, SequentialTurtleWriter w)
{
    long skipped = 0;
    bool endReached = false;

    while (!endReached && r.Read())
    {
        switch (r.NodeType)
        {
            case XmlNodeType.Element:
                if (r.LocalName == "row")
                {
                    using (var subR = r.ReadSubtree())
                    {
                        // Position the subtree reader on the row element itself.
                        subR.Read();
                        if (!ConvertSite(generalUris, subR, w))
                        {
                            skipped++;
                        }
                    }
                }
                break;

            case XmlNodeType.EndElement:
                // Leave the loop rather than the method: returning here would bypass the
                // summary warning below, so skipped rows would never be reported.
                endReached = true;
                break;
        }
    }

    if (skipped > 0)
    {
        ConsoleHelper.WriteWarningLine("{0} items from the list of sites were skipped.", skipped);
    }
}
/// <summary>
/// Program entry point: fixes the thread culture to en-US, parses and validates the command line
/// options and starts the retrieval for the first specified file list URL.
/// </summary>
/// <param name="args">The command line arguments.</param>
/// <remarks>Any exception that escapes the processing is printed to the console.</remarks>
public static void Main(string[] args)
{
    var enUs = new System.Globalization.CultureInfo("en-US");
    var currentThread = System.Threading.Thread.CurrentThread;
    currentThread.CurrentCulture = enUs;
    currentThread.CurrentUICulture = enUs;

    try
    {
        var options = GlobalData.Options;

        using (var parser = new CommandLine.Parser())
        {
            bool parsed = parser.ParseArguments(args, options);
            if (!parsed)
            {
                Console.WriteLine(options.GetUsage());
                return;
            }
        }

        if (options.MaxFileCount < 1)
        {
            ConsoleHelper.WriteErrorLine("As the maximum file count is set to {0}, no files can be imported.", options.MaxFileCount);
            return;
        }

        // Fall back to the default file list when none was given.
        if (options.Files.Count < 1)
        {
            ConsoleHelper.WriteInfoLine("No input file specified. Using default URL {0}.", DefaultFileUrl);
            options.Files.Add(DefaultFileUrl);
        }

        // The site name pattern is optional; an unparsable pattern aborts the run.
        if (!string.IsNullOrWhiteSpace(options.SiteNamePattern))
        {
            try
            {
                GlobalData.SiteNamePattern = new Regex(options.SiteNamePattern);
            }
            catch (ArgumentException)
            {
                ConsoleHelper.WriteErrorLine("Invalid regular expression: \"{0}\"", options.SiteNamePattern);
                return;
            }
        }

        GeneralUris generalUris = new GeneralUris(options.GeneralPrefix);
        try
        {
            Uri filelist = new Uri(options.Files[0]);
            RetrieveData(generalUris, filelist);
        }
        catch (UriFormatException)
        {
            // Add context, then let the outer handler print the exception itself.
            Console.WriteLine("Invalid filelist URL:");
            throw;
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.ToString());
    }
}
/// <summary>
/// Creates <c>_constants.ttl</c> in <paramref name="destDir"/> and writes the close reasons into it.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="destDir">The directory the file is created in.</param>
/// <param name="nsMapper">The namespace mapper handed to the writer.</param>
private static void WriteConstants(GeneralUris generalUris, string destDir, VDS.RDF.INamespaceMapper nsMapper)
{
    ConsoleHelper.WriteMilestone("Writing constant definitions ...");

    string constantsPath = Path.Combine(destDir, "_constants.ttl");
    using (var writer = new SequentialTurtleWriter(File.CreateText(constantsPath), nsMapper))
    {
        WriteCloseReasons(generalUris, writer);

        // Hand the writer to the global statistics before it is disposed.
        GlobalData.UpdateStats(writer);
    }

    Console.WriteLine(" done.");
}
/// <summary>
/// Writes the description of a single site, read from the attributes of the current element.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="r">The reader positioned on the element that describes the site.</param>
/// <param name="w">The writer that receives the site description.</param>
/// <returns>
/// <see langword="true"/> if the site was written; <see langword="false"/> if the element has no
/// <c>Address</c> attribute or the address is not contained in <c>GlobalData.Sites</c>.
/// </returns>
private static bool ConvertSite(GeneralUris generalUris, XmlReader r, SequentialTurtleWriter w)
{
    if (!r.MoveToAttribute("Address"))
    {
        // Go back to the element so that its complete markup can be shown in the message.
        r.MoveToElement();
        ConsoleHelper.WriteErrorLine("No Address attribute found on element {0}. Skipping element.", r.ReadOuterXml());
        return false;
    }

    string address = r.Value;

    SiteInfo info;
    if (!GlobalData.Sites.TryGetValue(address, out info))
    {
        // Sites that are not registered are skipped without a message.
        return false;
    }

    Uri subjectUri = generalUris.CreateSiteUri(info);
    w.StartTriple(subjectUri);
    w.AddToTriple(generalUris.IsMetaSiteProperty, info.IsMetaSite);
    w.AddToTriple(generalUris.LanguageProperty, info.IsEnglishSite ? "en" : info.Language);
    w.AddToTriple(generalUris.TypeProperty, generalUris.SiteInfoType);
    w.AddToTriple(generalUris.WebsiteProperty, new Uri("http://" + address));

    if (r.MoveToAttribute("Name"))
    {
        // The name serves as both label and title.
        w.AddToTriple(generalUris.LabelProperty, r.Value);
        w.AddToTriple(generalUris.TitleProperty, r.Value);
    }

    if (r.MoveToAttribute("Description"))
    {
        w.AddToTriple(generalUris.DescriptionProperty, r.Value);
    }

    if (r.MoveToAttribute("ParentAddress"))
    {
        SiteInfo parentInfo;
        if (!GlobalData.Sites.TryGetValue(r.Value, out parentInfo))
        {
            ConsoleHelper.WriteWarningLine("Unknown parent site {0}; skipping information.", r.Value);
        }
        else
        {
            w.AddToTriple(generalUris.ParentSiteProperty, generalUris.CreateSiteUri(parentInfo));
        }
    }

    return true;
}
/// <summary>
/// Writes one entry per known close reason, each with its label.
/// </summary>
/// <param name="generalUris">An object that provides the URIs of the close reasons.</param>
/// <param name="w">The writer that receives the close reasons.</param>
private static void WriteCloseReasons(GeneralUris generalUris, SequentialTurtleWriter w)
{
    // Bind the arguments shared by all entries once.
    Action<Uri, string> emit = (reason, label) => WriteCloseReason(generalUris, w, reason, label);

    emit(generalUris.DuplicateCloseReason, "Duplicate");
    emit(generalUris.OffTopicCloseReason, "Off-topic");
    emit(generalUris.SubjectiveCloseReason, "Opinion-based");
    emit(generalUris.NotAQuestionCloseReason, "Not a real question");
    emit(generalUris.TooLocalizedCloseReason, "Too localized");
    emit(generalUris.GeneralReferenceCloseReason, "General reference");
    emit(generalUris.NoiseCloseReason, "Pointless/Noise");
    emit(generalUris.UnclearCloseReason, "Unclear what you're asking");
    emit(generalUris.TooBroadCloseReason, "Too broad");
}
/// <summary>
/// Creates <c>_users.ttl</c> in <paramref name="destDir"/> and writes a type statement for every
/// account id found in <c>GlobalData.AccountIds</c>.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="destDir">The directory the file is created in.</param>
/// <param name="nsMapper">The namespace mapper handed to the writer.</param>
private static void WriteAccountList(GeneralUris generalUris, string destDir, VDS.RDF.INamespaceMapper nsMapper)
{
    ConsoleHelper.WriteMilestone("Writing account list ...");

    string accountsPath = Path.Combine(destDir, "_users.ttl");
    using (var writer = new SequentialTurtleWriter(File.CreateText(accountsPath), nsMapper))
    {
        foreach (var accountId in GlobalData.AccountIds)
        {
            var accountUri = generalUris.CreateAccountUri(accountId);
            writer.StartTriple(accountUri);
            writer.AddToTriple(generalUris.TypeProperty, generalUris.AccountType);
        }

        // Hand the writer to the global statistics before it is disposed.
        GlobalData.UpdateStats(writer);
    }

    Console.WriteLine(" done.");
}
/// <summary>
/// Creates <c>_ontology.ttl</c> in <paramref name="destDir"/> and writes the ontology definitions into it.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="destDir">The directory the file is created in.</param>
/// <param name="nsMapper">The namespace mapper whose namespaces are imported for the ontology file.</param>
private static void WriteOntology(GeneralUris generalUris, string destDir, VDS.RDF.INamespaceMapper nsMapper)
{
    ConsoleHelper.WriteMilestone("Writing ontology ...");

    // The ontology file uses its own mapper: the imported namespaces plus the "owl" prefix.
    using (var ontologyNamespaces = new NamespaceMapper(false))
    {
        ontologyNamespaces.Import(nsMapper);
        ontologyNamespaces.AddNamespace("owl", new Uri(NamespaceMapper.OWL));

        string ontologyPath = Path.Combine(destDir, "_ontology.ttl");
        using (var writer = new SequentialTurtleWriter(File.CreateText(ontologyPath), ontologyNamespaces))
        {
            WriteOntologyDefinitions(generalUris, writer);
            GlobalData.UpdateStats(writer);
        }
    }

    Console.WriteLine(" done.");
}
/// <summary>
/// Creates <c>_badges.ttl</c> in <paramref name="destDir"/> and writes the information about every
/// badge name returned by <c>GlobalData.GetBadgesPerSite</c>, grouped by site.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="destDir">The directory the file is created in.</param>
/// <param name="nsMapper">The namespace mapper handed to the writer.</param>
private static void WriteBadgesLists(GeneralUris generalUris, string destDir, VDS.RDF.INamespaceMapper nsMapper)
{
    ConsoleHelper.WriteMilestone("Writing lists of badges ...");

    using (var destWriter = new SequentialTurtleWriter(File.CreateText(Path.Combine(destDir, "_badges.ttl")), nsMapper))
    {
        foreach (var siteBadges in GlobalData.GetBadgesPerSite())
        {
            // Item1 is the site, Item2 its badge names. The site URI that used to be computed
            // here was never used; the SiteUris constructor derives it on its own.
            SiteUris uris = new SiteUris(generalUris, siteBadges.Item1);

            foreach (string badgeName in siteBadges.Item2)
            {
                WriteBadgeInfo(uris, badgeName, destWriter);
            }
        }

        GlobalData.UpdateStats(destWriter);
    }

    Console.WriteLine(" done.");
}
/// <summary>
/// Initializes the site-specific URIs for <paramref name="site"/>.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="site">The site the URIs are created for.</param>
/// <exception cref="ArgumentNullException">Any of the arguments is <see langword="null"/>.</exception>
public SiteUris(GeneralUris generalUris, SiteInfo site)
{
    if (generalUris == null)
    {
        throw new ArgumentNullException("generalUris");
    }
    if (site == null)
    {
        throw new ArgumentNullException("site");
    }

    this.generalUris = generalUris;
    this.site = site;
    this.stackExchangeSite = generalUris.CreateSiteUri(site);

    // The site's base URI is "<site data prefix><name>/"; meta sites get "-meta" appended to the name.
    var dataPrefix = generalUris.SiteDataPrefix;
    string siteSegment = site.Name;
    if (site.IsMetaSite)
    {
        siteSegment += "-meta";
    }
    this.BaseUri = new Uri(dataPrefix, siteSegment + "/");

    this.tagPrefix = new Uri(this.BaseUri, "tag/");
    this.badgePrefix = new Uri(this.BaseUri, "badge/");
}
/// <summary>
/// Downloads <c>Sites.xml</c> (resolved against <paramref name="baseUri"/>) into
/// <paramref name="tempDir"/> and converts it to <c>_sites.ttl</c> in <paramref name="destDir"/>.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="tempDir">The directory the downloaded file is stored in.</param>
/// <param name="baseUri">The URI relative to which <c>Sites.xml</c> is located.</param>
/// <param name="destDir">The directory the converted file is created in.</param>
/// <param name="nsMapper">The namespace mapper handed to the writer.</param>
private static void ConvertSiteList(GeneralUris generalUris, string tempDir, Uri baseUri, string destDir, VDS.RDF.INamespaceMapper nsMapper)
{
    string localCopy = Path.Combine(tempDir, "Sites.xml");

    ConsoleHelper.WriteMilestone("Downloading site list ...");
    using (var downloader = new WebClient())
    {
        Uri remoteList = new Uri(baseUri, "Sites.xml");
        downloader.DownloadFile(remoteList, localCopy);
    }
    Console.WriteLine(" done.");

    string outputPath = Path.Combine(destDir, "_sites.ttl");
    using (var destWriter = new SequentialTurtleWriter(File.CreateText(outputPath), nsMapper))
    {
        using (var input = File.OpenRead(localCopy))
        using (var reader = XmlReader.Create(input))
        {
            // Advance to the root element; a document without any element ends the conversion here.
            while (reader.NodeType != XmlNodeType.Element)
            {
                if (reader.Read())
                {
                    continue;
                }

                ConsoleHelper.WriteErrorLine("No contents found in file {0}.", localCopy);
                return;
            }

            if (reader.LocalName != "sitelist")
            {
                ConsoleHelper.WriteWarningLine("Unknown root element \"{0}\". Skipping document.", reader.LocalName);
            }
            else
            {
                ConvertSites(generalUris, reader, destWriter);
            }
        }

        GlobalData.UpdateStats(destWriter);
    }

    Console.WriteLine("Conversion of site list completed.");
}
/// <summary>
/// Writes the ontology header followed by the class and property declarations of the exported vocabulary.
/// </summary>
/// <param name="generalUris">An object that provides the URIs of the declared classes and properties.</param>
/// <param name="w">The writer that receives the declarations.</param>
private static void WriteOntologyDefinitions(GeneralUris generalUris, SequentialTurtleWriter w)
{
    // ---- ontology metadata ----
    // The ontology is identified by the ontology prefix without a trailing '/' or '#'.
    string ontologyPrefix = generalUris.OntologyPrefix.AbsoluteUri;
    string ontologyUri = ontologyPrefix ?? "";
    if (ontologyUri.Length > 0)
    {
        char lastChar = ontologyUri[ontologyUri.Length - 1];
        if (lastChar == '/' || lastChar == '#')
        {
            ontologyUri = ontologyUri.Remove(ontologyUri.Length - 1);
        }
    }

    w.StartTriple(new Uri(ontologyUri));
    w.AddToTriple(generalUris.TypeProperty, new Uri(NamespaceMapper.OWL + "Ontology"));
    w.AddToTriple(generalUris.TitleProperty, "SE2Rdf Output");
    w.AddToTriple(generalUris.DateProperty, DateTime.Now);
    w.AddToTriple(new Uri(NamespaceMapper.OWL + "imports"), new Uri("http://purl.org/dc/elements/1.1")); // TODO: is this correct/required?

    // ---- types ----
    // Declares a class and makes it a subclass of the given parent.
    Action<Uri, string, Uri> declareSubClass = (type, label, parent) =>
    {
        WriteClassDecl(w, type, label);
        w.AddToTriple(subClassOfUri, parent);
    };

    Uri postType = new Uri(ontologyPrefix + "Post");
    declareSubClass(generalUris.QuestionType, "Question", postType);

    // Questions additionally get an anonymous superclass: exactly one title.
    w.AddAnonymousToTriple(subClassOfUri); // TODO: does not seem to work yet in VOWL => test in Protege
    //w.AddToTriple(generalUris.TypeProperty, restrictionUri);
    w.AddToTriple(new Uri(NamespaceMapper.OWL + "onProperty"), generalUris.TitleProperty);
    w.AddToTriple(new Uri(NamespaceMapper.OWL + "cardinality"), 1);
    w.FinishAnonymousNode();

    declareSubClass(generalUris.AnswerType, "Answer", postType);

    Uri tagWikiType = new Uri(ontologyPrefix + "TagWiki");
    declareSubClass(generalUris.TagExcerptType, "Tag Excerpt", tagWikiType);
    declareSubClass(generalUris.TagDescriptionType, "Tag Description", tagWikiType);

    WriteClassDecl(w, generalUris.SiteInfoType, "Q&A Site");
    WriteClassDecl(w, generalUris.UserType, "Site-specific User");
    declareSubClass(generalUris.AccountType, "Account", generalUris.PersonType);
    WriteClassDecl(w, generalUris.CommentType, "Comment");
    WriteClassDecl(w, generalUris.TagType, "Tag");
    WriteClassDecl(w, generalUris.AcceptanceType, "Acceptance");

    Uri voteType = new Uri(ontologyPrefix + "Vote");
    WriteClassDecl(w, voteType, "Vote");
    declareSubClass(generalUris.UpVoteType, "Upvote", voteType);
    declareSubClass(generalUris.DownVoteType, "Downvote", voteType);

    WriteClassDecl(w, generalUris.FavoriteType, "Favorite");
    WriteClassDecl(w, generalUris.BadgeType, "Badge");
    WriteClassDecl(w, generalUris.AssignedBadgeType, "Assigned Badge");

    Uri postActionType = new Uri(ontologyPrefix + "PostAction");
    WriteClassDecl(w, postActionType, "Post Action");
    declareSubClass(generalUris.PostClosureType, "Closure", postActionType);
    declareSubClass(generalUris.PostReopeningType, "Reopening", postActionType);
    declareSubClass(generalUris.PostDeletionType, "Deletion", postActionType);
    declareSubClass(generalUris.PostUndeletionType, "Undeletion", postActionType);
    declareSubClass(generalUris.PostLockingType, "Locking", postActionType);
    declareSubClass(generalUris.PostUnlockingType, "Unlocking", postActionType);
    declareSubClass(generalUris.PostProtectionType, "Protection", postActionType);
    declareSubClass(generalUris.PostUnprotectionType, "Unprotection", postActionType);

    WriteClassDecl(w, generalUris.StartOfBountyType, "Start of Bounty");
    WriteClassDecl(w, generalUris.EndOfBountyType, "End of Bounty");
    WriteClassDecl(w, generalUris.CloseReasonType, "Close Reason");

    // ---- properties ----
    // URIs that are used by several of the declarations below.
    Uri xsdInteger = new Uri(NamespaceMapper.XMLSCHEMA + "integer");
    Uri xsdDateTime = new Uri(NamespaceMapper.XMLSCHEMA + "dateTime");
    Uri xsdBoolean = new Uri(NamespaceMapper.XMLSCHEMA + "boolean");
    Uri owlIrreflexive = new Uri(NamespaceMapper.OWL + "IrreflexiveProperty");

    WritePropDecl(w, generalUris.StackExchangeWebsiteProperty, true,
        new[] { postType, generalUris.TagType, generalUris.UserType, generalUris.BadgeType },
        new[] { generalUris.SiteInfoType });
    WritePropDecl(w, generalUris.ScoreProperty, false,
        new[] { generalUris.CommentType, postType },
        new[] { xsdInteger });
    WritePropDecl(w, generalUris.OwnerProperty, true,
        new[] { generalUris.CommentType, postType, tagWikiType },
        new[] { generalUris.UserType });
    WritePropDecl(w, generalUris.CloseReasonProperty, true,
        new[] { generalUris.PostClosureType },
        new[] { generalUris.CloseReasonType });
    WritePropDecl(w, generalUris.ParticipantProperty, true,
        new[] { postActionType },
        new[] { generalUris.UserType });
    WritePropDecl(w, generalUris.CommentProperty, true,
        new[] { postType },
        new[] { generalUris.CommentType });
    WritePropDecl(w, generalUris.ViewCountProperty, false,
        new[] { generalUris.QuestionType, generalUris.UserType, generalUris.AccountType },
        new[] { xsdInteger });
    WritePropDecl(w, generalUris.TagProperty, true,
        new[] { postType }, // TODO: verify!
        new[] { generalUris.TagType });
    WritePropDecl(w, generalUris.AnswerProperty, true,
        new[] { generalUris.QuestionType, generalUris.EndOfBountyType },
        new[] { generalUris.AnswerType });

    WritePropDecl(w, generalUris.AcceptedAnswerProperty, true,
        new[] { generalUris.QuestionType },
        new[] { generalUris.AnswerType });
    w.AddToTriple(subPropertyOfUri, generalUris.AnswerProperty);

    WritePropDecl(w, generalUris.LastEditDateProperty, false,
        new[] { postType, tagWikiType },
        new[] { xsdDateTime });
    WritePropDecl(w, generalUris.DuplicateProperty, true,
        new[] { generalUris.QuestionType },
        new[] { generalUris.QuestionType },
        owlIrreflexive);
    WritePropDecl(w, generalUris.EventProperty, true,
        new[] { postType },
        new[] { postActionType });
    WritePropDecl(w, generalUris.TagExcerptProperty, true,
        new[] { generalUris.TagType },
        new[] { generalUris.TagExcerptType });
    WritePropDecl(w, generalUris.TagDescriptionProperty, true,
        new[] { generalUris.TagType },
        new[] { generalUris.TagDescriptionType });
    WritePropDecl(w, generalUris.BadgeProperty, true,
        new[] { generalUris.UserType },
        new[] { generalUris.BadgeType });
    WritePropDecl(w, generalUris.ReputationProperty, false,
        new[] { generalUris.UserType },
        new[] { xsdInteger });
    WritePropDecl(w, generalUris.UpVotesProperty, false,
        new[] { generalUris.UserType },
        new[] { xsdInteger });
    WritePropDecl(w, generalUris.DownVotesProperty, false,
        new[] { generalUris.UserType },
        new[] { xsdInteger });

    WritePropDecl(w, generalUris.AccountProperty, true,
        new[] { generalUris.UserType },
        new[] { generalUris.AccountType });
    w.AddToTriple(generalUris.TypeProperty, new Uri(NamespaceMapper.OWL + "FunctionalProperty"));

    WritePropDecl(w, generalUris.LastSeenProperty, false,
        new[] { generalUris.UserType },
        new[] { xsdDateTime });
    WritePropDecl(w, generalUris.FavoriteProperty, true,
        new[] { generalUris.UserType },
        new[] { generalUris.QuestionType });
    WritePropDecl(w, generalUris.IsMetaSiteProperty, false,
        new[] { generalUris.SiteInfoType },
        new[] { xsdBoolean });
    WritePropDecl(w, generalUris.ParentSiteProperty, true,
        new[] { generalUris.SiteInfoType },
        new[] { generalUris.SiteInfoType },
        owlIrreflexive);
    WritePropDecl(w, generalUris.PostProperty, true,
        new[] { generalUris.StartOfBountyType },
        new[] { generalUris.QuestionType });
    WritePropDecl(w, generalUris.DonorProperty, true,
        new[] { generalUris.StartOfBountyType },
        new[] { generalUris.UserType });
    WritePropDecl(w, generalUris.OfferedAmountProperty, false,
        new[] { generalUris.StartOfBountyType },
        new[] { xsdInteger });
    WritePropDecl(w, generalUris.TransferredAmountProperty, false,
        new[] { generalUris.EndOfBountyType },
        new[] { xsdInteger });
}
/// <summary>
/// Converts a single XML data file of a site into a Turtle file in <paramref name="destDir"/>.
/// The converter is chosen by the name of the document's root element.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="srcFile">The path of the XML file to convert.</param>
/// <param name="destDir">The directory the converted file is created in.</param>
/// <param name="website">The site the file belongs to.</param>
public static void Convert(GeneralUris generalUris, string srcFile, string destDir, SiteInfo website)
{
    string shortName = Path.GetFileName(srcFile);
    Console.WriteLine("Processing {0} ...", shortName);

    var siteUris = new SiteUris(generalUris, website);
    var nsMapper = siteUris.CreateNamespaceMapper();

    // Output name: "<website>-<source file name without extension>.ttl".
    string outputPath = Path.Combine(destDir, website + "-" + Path.GetFileNameWithoutExtension(srcFile) + ".ttl");
    using (var destWriter = new SequentialTurtleWriter(File.CreateText(outputPath), nsMapper))
    {
        using (var input = File.OpenRead(srcFile))
        using (var reader = XmlReader.Create(input))
        {
            // Advance to the root element; a document without any element ends the conversion here.
            while (reader.NodeType != XmlNodeType.Element)
            {
                if (reader.Read())
                {
                    continue;
                }

                ConsoleHelper.WriteErrorLine("No contents found in file {0}.", srcFile);
                return;
            }

            string rootName = reader.LocalName;
            switch (rootName)
            {
                case "badges":
                    ConsoleHelper.WriteInfoLine("List of badges identified.");
                    ConvertBadges(siteUris, reader, destWriter);
                    break;

                case "comments":
                    ConsoleHelper.WriteInfoLine("List of comments identified.");
                    ConvertComments(siteUris, reader, destWriter);
                    break;

                case "posthistory":
                    ConsoleHelper.WriteInfoLine("List of posthistory identified.");
                    ConvertPostHistory(siteUris, reader, destWriter);
                    break;

                case "postlinks":
                    ConsoleHelper.WriteInfoLine("List of postlinks identified.");
                    ConvertPostLinks(siteUris, reader, destWriter);
                    break;

                case "posts":
                    ConsoleHelper.WriteInfoLine("List of posts identified.");
                    ConvertPosts(siteUris, reader, destWriter);
                    break;

                case "tags":
                    ConsoleHelper.WriteInfoLine("List of tags identified.");
                    ConvertTags(siteUris, reader, destWriter);
                    break;

                case "users":
                    ConsoleHelper.WriteInfoLine("List of users identified.");
                    ConvertUsers(siteUris, reader, destWriter);
                    break;

                case "votes":
                    ConsoleHelper.WriteInfoLine("List of votes identified.");
                    ConvertVotes(siteUris, reader, destWriter);
                    break;

                default:
                    ConsoleHelper.WriteWarningLine("Unknown root element \"{0}\". Skipping document.", reader.LocalName);
                    break;
            }
        }

        GlobalData.UpdateStats(destWriter);
    }

    Console.WriteLine("Conversion of {0} completed.", shortName);
}
/// <summary>
/// Downloads and converts the data.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="filelist">The URL of a filelist Xml file.</param>
/// <exception cref="ArgumentNullException">Any of the arguments is <see langword="null"/>.</exception>
/// <remarks>
/// Downloads go to the <c>tmp</c> directory and converted files to the <c>rdf</c> directory, both
/// located below <c>BaseDir</c>. The temporary directory is removed afterwards unless the
/// <c>KeepTemporaryFiles</c> option is set.
/// </remarks>
private static void RetrieveData(GeneralUris generalUris, Uri filelist)
{
    if (generalUris == null)
    {
        throw new ArgumentNullException("generalUris");
    }
    if (filelist == null)
    {
        throw new ArgumentNullException("filelist");
    }

    string tempDir = Path.Combine(BaseDir, "tmp");
    string destDir = Path.Combine(BaseDir, "rdf");
    Directory.CreateDirectory(tempDir);
    Directory.CreateDirectory(destDir);

    DateTime startTime = DateTime.Now;
    ConsoleHelper.WriteInfoLine("Current time: {0:yyyy-MM-dd HH:mm:ss}", startTime);

    if (!GlobalData.Options.OntologyOnly)
    {
        string listPath = Path.Combine(tempDir, "files.xml");

        ConsoleHelper.WriteMilestone("Downloading files list ...");
        using (var client = new WebClient())
        {
            client.DownloadFile(filelist, listPath);
        }
        Console.WriteLine(" done.");

        var files = LoadFilesList(listPath).OrderBy(f => f).ToArray();
        double totalSizeInGb = (double)files.Sum(f => f.Size) / 1024 / 1024 / 1024;
        ConsoleHelper.WriteInfoLine("{0} file(s) in list, totalling to a compressed size of {1:F1} GB.", files.Length, totalSizeInGb);

        int processedFilesCount = 0;
        foreach (var entry in files)
        {
            SiteInfo siteInfo;
            try
            {
                siteInfo = entry.RetrieveSiteInfo();
            }
            catch (ArgumentException ex)
            {
                ConsoleHelper.WriteErrorLine("Skipping file {0}, as it cannot be associated with a website.\n{1}", entry, ex);
                continue;
            }

            // Excluded sites are neither registered nor counted.
            if (!IncludeSite(siteInfo))
            {
                continue;
            }

            GlobalData.Sites[siteInfo.Id] = siteInfo;

            if (!GlobalData.Options.SiteListOnly)
            {
                string archivePath = entry.Download(filelist, tempDir);

                // Only 7z archives can be extracted; any other file is reported and left alone.
                string[] rawFiles = null;
                if (Path.GetExtension(archivePath) == ".7z")
                {
                    rawFiles = ExtractSevenZipArchive(archivePath);
                }
                else
                {
                    ConsoleHelper.WriteWarningLine("File {0} has an unknown file extension.", archivePath);
                }

                if (rawFiles != null)
                {
                    ConsoleHelper.WriteInfoLine("{0} file(s) extracted.", rawFiles.Length);
                    foreach (var rawFile in rawFiles)
                    {
                        Converter.Convert(generalUris, rawFile, destDir, siteInfo);
                    }
                }
            }

            // Stop as soon as the configured number of files has been handled.
            processedFilesCount++;
            if (processedFilesCount >= GlobalData.Options.MaxFileCount)
            {
                break;
            }
        }
    }

    GlobalInformationConverter.Convert(generalUris, tempDir, filelist, destDir);

    if (!GlobalData.Options.KeepTemporaryFiles)
    {
        Console.Write("Removing temporary files ... ");
        try
        {
            Directory.Delete(tempDir, true);
            Console.WriteLine(" done.");
        }
        catch
        {
            // Cleanup is best effort: ask the user to remove the directory instead of failing.
            ConsoleHelper.WriteErrorLine("Please remove the directory {0} manually.", tempDir);
        }
    }

    Console.WriteLine();
    GlobalData.PrintStats();

    DateTime endTime = DateTime.Now;
    ConsoleHelper.WriteInfoLine("Current time: {0:yyyy-MM-dd HH:mm:ss}", endTime);
    ConsoleHelper.WriteInfoLine("Total duration: {0}", endTime - startTime);
}
/// <summary>
/// Writes a single close reason: its type and its label.
/// </summary>
/// <param name="generalUris">An object that provides general URIs used in the exported dataset.</param>
/// <param name="w">The writer that receives the close reason.</param>
/// <param name="reason">The URI that identifies the close reason.</param>
/// <param name="name">The label of the close reason.</param>
private static void WriteCloseReason(GeneralUris generalUris, SequentialTurtleWriter w, Uri reason, string name)
{
    w.StartTriple(reason);

    var typePredicate = generalUris.TypeProperty;
    var closeReasonClass = generalUris.CloseReasonType;
    w.AddToTriple(typePredicate, closeReasonClass);

    var labelPredicate = generalUris.LabelProperty;
    w.AddToTriple(labelPredicate, name);
}