/// <summary>
/// Collects every user returned by a paged /users/ API endpoint that yields JSON arrays of User objects,
/// such as the followers of a user or the users that user is following.
/// Pages of 40 are requested with an increasing offset until an empty page or an empty response is received.
/// </summary>
/// <param name="userName">The first parameter to the /users/ API endpoint.</param>
/// <param name="route">The route modifier. Either "followers" or "following"</param>
/// <returns>A single JsonArray holding the values of all pages that were retrieved.</returns>
private JsonArray GetAllFollowersOrFollowingsByUsername(string userName, string route)
{
    // The {0} placeholder is kept in the template and filled in with the offset for every page.
    string endpointTemplate = $"https://api.scratch.mit.edu/users/{userName}/{route}?limit=40&offset={{0}}";
    JsonArray collectedUsers = new JsonArray();

    for (int pageOffset = 0; ; pageOffset += 40)
    {
        string pageUrl = string.Format(endpointTemplate, pageOffset);
        string pageJson = JSONGetter.GetAsJSONString(pageUrl);
        if (string.IsNullOrEmpty(pageJson))
        {
            // Nothing came back for this page; return what has been gathered so far.
            break;
        }

        JsonValue page = JsonValue.Parse(pageJson);
        foreach (var entry in page)
        {
            collectedUsers.Add(entry.Value);
        }

        if (page.Count == 0)
        {
            // An empty page marks the end of the data.
            break;
        }
    }

    return collectedUsers;
}
/// <summary>
/// Scrapes project authors from the Scratch "explore" listings (trending, popular, recent) and the project search listing.
/// For every project on a page the author's profile JSON is fetched with GetAuthorJson and written out with
/// WriteAuthorToFile; the distinct authors of the page (by id) are then passed to SaveAuthorsToDatabase.
/// </summary>
/// <param name="skip">The offset at which to start in every listing. Pages are requested in steps of 40 up to offset 9980.</param>
public void Scrape(int skip = 0)
{
    const int pageSize = 40;
    const int maxOffset = 9980;

    string[] allURLSToConsider = new string[]
    {
        "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=trending&q=*",
        "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=popular&q=*",
        "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=recent&q=*",
        // The '&' between limit and q was missing, which merged both parameters into "limit=40q=*".
        "https://api.scratch.mit.edu/search/projects?limit=40&q=*&offset={0}",
    };

    foreach (string baseURL in allURLSToConsider)
    {
        int offset = skip;

        // At least one page is always requested, even when skip is already beyond maxOffset.
        do
        {
            Console.WriteLine("Scraping at offset: " + offset.ToString());

            // The try/catch lives inside the loop so that a failing page is skipped
            // instead of abandoning the remainder of the listing.
            try
            {
                string specificURL = string.Format(baseURL, offset.ToString());
                string rawJson = JSONGetter.GetAsJSONString(specificURL);

                if (string.IsNullOrEmpty(rawJson))
                {
                    // Skip this page; retrying the same offset without advancing could loop forever.
                    Console.WriteLine("\t\tGetJSON2 returned null.");
                }
                else
                {
                    dynamic projectsObject = JsonValue.Parse(rawJson);
                    List<ProjectAuthor> scrapedAuthors = new List<ProjectAuthor>();

                    foreach (var projectData in projectsObject)
                    {
                        string authorJson = GetAuthorJson(projectData["author"]["username"].ReadAs<string>());
                        if (string.IsNullOrEmpty(authorJson))
                        {
                            // No profile could be retrieved for this author; keep the rest of the page.
                            continue;
                        }

                        ProjectAuthor projectAuthor = Newtonsoft.Json.JsonConvert.DeserializeObject<ProjectAuthor>(authorJson);
                        scrapedAuthors.Add(projectAuthor);
                        WriteAuthorToFile(projectAuthor.username, authorJson);
                    }

                    // An author may own several projects on the same page; store each author only once.
                    SaveAuthorsToDatabase(ProjectAuthorsToDatabaseEntities(scrapedAuthors.GroupBy(x => x.id).Select(y => y.First()).ToList()));
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Exception ocurred: {ex.Message}");
            }

            offset += pageSize;
        }
        while (offset <= maxOffset);
    }
}
/// <summary>
/// Registers the projects whose ids are listed in "UnregisteredProjects.txt" inside
/// <paramref name="pathToUnregisteredProjects"/>. For every id that is not yet in the Projects table the project
/// info is fetched from the API; when the author is already in the Authors table the project is parsed and saved,
/// and the project file is downloaded first if no file for that id exists in the directory.
/// </summary>
/// <param name="pathToUnregisteredProjects">Directory containing "UnregisteredProjects.txt" and the already downloaded project files.</param>
/// <param name="p">The scraper used to download project files and to parse the project info JSON.</param>
internal static void ProcessUnregisteredProjects(string pathToUnregisteredProjects, ProjectScraper p)
{
    string unregisteredProjectsFile = Path.Combine(pathToUnregisteredProjects, "UnregisteredProjects.txt");
    string[] unregisteredProjectIds;
    Dictionary<string, string> projectCache;

    // Progress timer with a twenty second interval. The using block guarantees that it stops firing
    // and is disposed, even when one of the enumeration steps below throws.
    using (System.Timers.Timer aTimer = new System.Timers.Timer(20000))
    {
        aTimer.Elapsed += (source, e) => { Say("Not done yet.."); };
        aTimer.AutoReset = true;
        aTimer.Enabled = true;

        Say($"Enumerating unregistered project files in {unregisteredProjectsFile}.");
        unregisteredProjectIds = File.ReadAllLines(unregisteredProjectsFile).Distinct<string>().ToArray();
        Say($"Enumerating unregistered project files done");

        // Stop/Start restarts the interval so the next reminder is measured from the start of the next step.
        aTimer.Stop();
        aTimer.Start();

        Say($"Enumerating existing project files in {pathToUnregisteredProjects}. This could take a very long time...");
        string[] fileNames = Directory.GetFiles(pathToUnregisteredProjects).Select(o => Path.GetFileName(o)).ToArray();
        Say($"Enumerating existing project files done.");

        aTimer.Stop();
        aTimer.Start();

        Say($"Creating projects cache. This could take a very long time...");
        // Maps the part of the file name before the first '.' to the remainder (including the dot).
        projectCache = new Dictionary<string, string>(fileNames.Length);
        foreach (string fileName in fileNames)
        {
            int dotIndex = fileName.IndexOf('.');
            if (dotIndex < 0)
            {
                // A name without a '.' cannot be split into id and extension.
                continue;
            }

            // The indexer tolerates the same id occurring with several extensions.
            projectCache[fileName.Substring(0, dotIndex)] = "." + fileName.Substring(dotIndex + 1);
        }
        Say($"Creating projects cache done.");

        fileNames = null; //Otherwise, millions of strings will be hanging around for no reason.
    }

    // Deliberate collection: release the file name array before the long-running loop below.
    GC.Collect();

    const string baseUrl = "https://api.scratch.mit.edu/projects/{0}";

    using (ApplicationDatabase ctxt = new ApplicationDatabase())
    {
        foreach (string projectId in unregisteredProjectIds)
        {
            if (!Int32.TryParse(projectId, out int projectIdAsInt))
            {
                continue;
            }

            if (ctxt.Projects.AsNoTracking().Any(o => o.Id == projectIdAsInt))
            {
                continue;
            }

            string projectInfoJson = JSONGetter.GetAsJSONString(string.Format(baseUrl, projectId));
            if (string.IsNullOrEmpty(projectInfoJson))
            {
                // Without project info there is nothing to parse; do not let one project abort the batch.
                Say($"Could not retrieve project info for project: {projectId}");
                continue;
            }

            JObject projectObject = JObject.Parse(projectInfoJson);
            JToken authorToken = projectObject["author"];
            if (authorToken == null || authorToken.Type == JTokenType.Null)
            {
                Say($"Project info without author for project: {projectId}");
                continue;
            }

            ProjectAuthor author = Newtonsoft.Json.JsonConvert.DeserializeObject<ProjectAuthor>(authorToken.ToString());

            if (ctxt.Authors.AsNoTracking().Any(o => o.Id == author.id)) //If the author is known...
            {
                projectCache.TryGetValue(projectId, out string fileExtension); //Validate if it exists as a file...
                if (string.IsNullOrEmpty(fileExtension))
                {
                    p.DownloadProjectToFile(projectId);
                }

                Project newProject = p.ParseProject(projectInfoJson, false);
                newProject.AuthorId = author.id;
                ctxt.Projects.Add(newProject);
                ctxt.SaveChanges();
                //TODO: Optionally immediately parse the actual project and its blocks.
            }
            else
            {
                Say($"Found project from unknown author: {author.id}");
            }

            projectCache.Remove(projectId); //This way, the cache will immediately get rid of now useless entries
        }
    }
}
/// <summary>
/// Requests a single user from the /users/ API endpoint.
/// </summary>
/// <param name="userName">The user name that is appended to the /users/ route.</param>
/// <returns>The result of JSONGetter.GetAsJSONString for that user's endpoint.</returns>
public string GetAuthorJson(string userName) =>
    JSONGetter.GetAsJSONString($"https://api.scratch.mit.edu/users/{userName}");
/// <summary>
/// Retrieves the projects of a user from the /users/{userName}/projects API endpoint, 40 at a time,
/// until an empty page or an empty response is received. Projects whose "is_published" flag is false are skipped.
/// </summary>
/// <param name="userName">The user whose projects are requested.</param>
/// <param name="ignoreRemixes">When true, projects that have a remix root or a remix parent are left out.</param>
/// <returns>
/// The projects collected so far. When an exception occurs its message is written to the console
/// and the projects gathered up to that point are returned.
/// </returns>
public List<Project> GetProjectsByUsername(string userName, bool ignoreRemixes = false)
{
    string apiEndpoint = "https://api.scratch.mit.edu/users/" + userName + "/projects?limit=40&offset={0}";
    bool endOfDataReached = false;
    int offset = 0;
    List<Project> allProjectsOfUser = new List<Project>();

    try
    {
        while (!endOfDataReached)
        {
            string specifiedApiEndpoint = string.Format(apiEndpoint, offset);
            string returnedProjects = JSONGetter.GetAsJSONString(specifiedApiEndpoint);
            if (string.IsNullOrEmpty(returnedProjects))
            {
                break;
            }

            JArray parsedProjects = JArray.Parse(returnedProjects);
            if (parsedProjects.Count == 0)
            {
                endOfDataReached = true;
            }

            foreach (JToken project in parsedProjects)
            {
                // The array elements are already parsed; no need to serialize and parse them again.
                JObject projectObject = (JObject)project;

                // "remix" may be absent or null; in that case the project is treated as not remixed.
                JObject remixObject = projectObject["remix"] as JObject;
                string remixRootText = remixObject?["root"]?.ToString();
                string remixParentText = remixObject?["parent"]?.ToString();

                bool isRemixed = !string.IsNullOrEmpty(remixRootText); //Check if this is a remixed project
                bool hasRemixParent = !string.IsNullOrEmpty(remixParentText);

                if (ignoreRemixes && (isRemixed || hasRemixParent))
                {
                    continue;
                }

                // The root id is read from the "root" token (it used to parse its own initial value, always 0).
                int remixRoot = isRemixed ? Int32.Parse(remixRootText) : 0;
                int remixParent = hasRemixParent ? Int32.Parse(remixParentText) : 0;

                if (projectObject["is_published"].Value<bool>() == false) //Check if the project is published (not private)
                {
                    Console.WriteLine($"P: {projectObject["id"]}");
                    continue;
                }

                ProjectStats projectStats = Newtonsoft.Json.JsonConvert.DeserializeObject<ProjectStats>(projectObject["stats"].ToString());
                ProjectHistory projectHistory = Newtonsoft.Json.JsonConvert.DeserializeObject<ProjectHistory>(projectObject["history"].ToString());

                Project toAdd = new Project
                {
                    Id = Int32.Parse(projectObject["id"].ToString()),
                    ProjectName = projectObject["title"].ToString(),
                    AuthorId = 0,
                    Author = null,
                    Created = projectHistory.created,
                    Modified = projectHistory.modified,
                    TotalViews = projectStats.views,
                    TotalFavorites = projectStats.favorites,
                    TotalLoves = projectStats.loves,
                    // Use the shared date when there is one; fall back to MinValue only when it is missing.
                    Shared = projectHistory.shared != null ? (DateTime)projectHistory.shared : DateTime.MinValue,
                    IsRemix = isRemixed,
                    RemixParent = remixParent,
                    RemixRoot = remixRoot
                };
                allProjectsOfUser.Add(toAdd);
            }

            offset += 40;
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the problem and hand back whatever was collected before it occurred.
        Console.WriteLine(ex.Message);
    }

    return allProjectsOfUser;
}