/// <summary>
/// Pages through several Scratch project listings (explore: trending, popular, recent; and search).
/// For every project on a page the author's JSON is fetched via <c>GetAuthorJson</c>, written to file
/// via <c>WriteAuthorToFile</c>, and the page's authors (de-duplicated by id) are saved to the database.
/// </summary>
/// <param name="skip">Offset at which paging starts for each listing URL.</param>
/// <remarks>
/// A failure on one page (empty response or exception) is logged and paging continues with the
/// next page of the same listing instead of retrying forever or abandoning the listing.
/// </remarks>
public void Scrape(int skip = 0)
{
    const int pageSize = 40;    // Must match the "limit=40" query parameter in the URLs below.
    const int maxOffset = 9980; // Paging of a listing stops once the offset exceeds this value.

    string[] allURLSToConsider = new string[]
    {
        "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=trending&q=*",
        "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=popular&q=*",
        "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=recent&q=*",
        // Fixed: the separator between "limit=40" and "q=*" was missing.
        "https://api.scratch.mit.edu/search/projects?limit=40&q=*&offset={0}",
    };

    foreach (string baseURL in allURLSToConsider)
    {
        int offset = skip;

        // do/while keeps the original behavior of always requesting the first page,
        // even when 'skip' is already beyond maxOffset.
        do
        {
            // The try/catch lives inside the loop so that one bad page only costs that page.
            try
            {
                Console.WriteLine("Scraping at offset: " + offset.ToString());
                string specificURL = string.Format(baseURL, offset.ToString());
                string rawJson = JSONGetter.GetAsJSONString(specificURL);

                if (string.IsNullOrEmpty(rawJson))
                {
                    // Fall through to the offset increment below; a bare 'continue' without
                    // advancing the offset would request the same page forever.
                    Console.WriteLine("\t\tGetJSON2 returned null.");
                }
                else
                {
                    dynamic projectsObject = JsonValue.Parse(rawJson);
                    List<ProjectAuthor> scrapedAuthors = new List<ProjectAuthor>();

                    foreach (var projectData in projectsObject)
                    {
                        string authorJson = GetAuthorJson(projectData["author"]["username"].ReadAs<string>());
                        if (string.IsNullOrEmpty(authorJson))
                        {
                            // Nothing to deserialize; skip this author but keep the rest of the page.
                            Console.WriteLine("\t\tNo author JSON received; skipping author.");
                            continue;
                        }

                        ProjectAuthor projectAuthor = Newtonsoft.Json.JsonConvert.DeserializeObject<ProjectAuthor>(authorJson);
                        if (projectAuthor == null)
                        {
                            Console.WriteLine("\t\tAuthor JSON could not be deserialized; skipping author.");
                            continue;
                        }

                        scrapedAuthors.Add(projectAuthor);
                        WriteAuthorToFile(projectAuthor.username, authorJson);
                    }

                    // The same author can own several projects on one page; keep the first occurrence per id.
                    List<ProjectAuthor> uniqueAuthors = scrapedAuthors
                        .GroupBy(x => x.id)
                        .Select(y => y.First())
                        .ToList();
                    SaveAuthorsToDatabase(ProjectAuthorsToDatabaseEntities(uniqueAuthors));
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Exception ocurred: {ex.Message}");
            }

            offset += pageSize;
        }
        while (offset <= maxOffset);
    }
}
/// <summary>
/// Synchronises a stored author with the author JSON cached in <paramref name="outputDirectory"/>.
/// When the cache file "&lt;Username&gt;.json" is missing, or exists but is blank, the JSON is fetched
/// through <c>GetAuthorJson</c> and written back through <c>WriteAuthorToFile</c>. Username, join date,
/// last-login date and country are then copied over where they differ, and the author is passed to
/// <c>UpdateAuthor</c>.
/// </summary>
/// <param name="toCheck">The stored author to compare and update.</param>
/// <param name="outputDirectory">Directory in which the cached author JSON file is looked up.</param>
/// <remarks>
/// NOTE(review): a null deserialization result, or missing <c>history</c>/<c>profile</c> data, is not
/// handled here and would surface as an exception — confirm whether callers rely on that.
/// </remarks>
private void ValidateAuthor(Author toCheck, string outputDirectory)
{
    string cachePath = Path.Combine(outputDirectory, $"{toCheck.Username}.json");
    string json;

    if (!File.Exists(cachePath))
    {
        // No cached copy yet: fetch and store it.
        json = GetAuthorJson(toCheck.Username);
        WriteAuthorToFile(toCheck.Username, json);
    }
    else
    {
        json = File.ReadAllText(cachePath);

        // IsNullOrWhiteSpace also covers the null and empty cases.
        if (string.IsNullOrWhiteSpace(json))
        {
            json = GetAuthorJson(toCheck.Username);
            Console.WriteLine($"0-length file encountered for {toCheck.Username}");
            WriteAuthorToFile(toCheck.Username, json, true);
        }
    }

    ProjectAuthor fetched = Newtonsoft.Json.JsonConvert.DeserializeObject<ProjectAuthor>(json);

    // Each field is only assigned when it actually differs from the fetched value.
    var fetchedName = fetched.username;
    if (toCheck.Username != fetchedName)
    {
        toCheck.Username = fetchedName;
    }

    var fetchedJoined = fetched.history.joined;
    if (toCheck.DateJoined != fetchedJoined)
    {
        toCheck.DateJoined = fetchedJoined;
    }

    var fetchedLogin = fetched.history.login;
    if (toCheck.DateLastLogged != fetchedLogin)
    {
        toCheck.DateLastLogged = fetchedLogin;
    }

    var fetchedCountry = fetched.profile.country;
    if (toCheck.Country != fetchedCountry)
    {
        toCheck.Country = fetchedCountry;
    }

    UpdateAuthor(toCheck);
}
/// <summary>
/// Registers projects whose ids are listed in "UnregisteredProjects.txt" (one id per line, located in
/// <paramref name="pathToUnregisteredProjects"/>) but which are not yet present in the database.
/// For each such project the project info is requested from the Scratch API; when the author is
/// already known in the database the project is downloaded (unless a file for it already exists in
/// the directory), parsed through <paramref name="p"/> and saved.
/// </summary>
/// <param name="pathToUnregisteredProjects">
/// Directory containing "UnregisteredProjects.txt" and the already downloaded project files.
/// </param>
/// <param name="p">Scraper used to download and parse projects.</param>
internal static void ProcessUnregisteredProjects(string pathToUnregisteredProjects, ProjectScraper p)
{
    string listFilePath = Path.Combine(pathToUnregisteredProjects, "UnregisteredProjects.txt");
    string[] unregisteredProjectIds;

    // Maps project id (file name up to the first '.') to the rest of the file name, prefixed with '.'.
    Dictionary<string, string> projectCache = new Dictionary<string, string>();

    // Progress timer with a twenty second interval; it only reports that work is still ongoing.
    System.Timers.Timer aTimer = new System.Timers.Timer(20000);
    try
    {
        aTimer.Elapsed += (source, e) => { Say("Not done yet.."); };
        aTimer.AutoReset = true;
        aTimer.Enabled = true;

        Say($"Enumerating unregistered project files in {listFilePath}.");
        unregisteredProjectIds = File.ReadAllLines(listFilePath).Distinct().ToArray();
        Say($"Enumerating unregistered project files done");

        // Stop/Start restarts the interval so the next progress message is a full interval away.
        aTimer.Stop();
        aTimer.Start();

        Say($"Enumerating existing project files in {pathToUnregisteredProjects}. This could take a very long time...");
        string[] fileNames = Directory.GetFiles(pathToUnregisteredProjects).Select(o => Path.GetFileName(o)).ToArray();
        Say($"Enumerating existing project files done.");
        aTimer.Stop();
        aTimer.Start();

        Say($"Creating projects cache. This could take a very long time...");
        foreach (string fileName in fileNames)
        {
            int dotIndex = fileName.IndexOf('.');
            if (dotIndex <= 0)
            {
                // No extension (or no name before the dot): cannot be keyed by project id.
                // Substring(0, -1) would throw here, so such files are skipped.
                continue;
            }

            // The indexer tolerates several files for the same id (e.g. "1.sb2" and "1.json"),
            // where ToDictionary would throw on the duplicate key. Only the presence of an
            // entry is used below, so which extension wins is irrelevant.
            projectCache[fileName.Substring(0, dotIndex)] = $".{fileName.Substring(dotIndex + 1)}";
        }
        Say($"Creating projects cache done.");
    }
    finally
    {
        // Always release the timer; otherwise it keeps firing if one of the steps above throws.
        aTimer.Dispose();
    }

    // The file-name array is out of scope now; otherwise millions of strings would be hanging
    // around for no reason. The explicit collection is kept from the original implementation.
    GC.Collect();

    const string baseUrl = "https://api.scratch.mit.edu/projects/{0}";

    using (ApplicationDatabase ctxt = new ApplicationDatabase())
    {
        foreach (string projectId in unregisteredProjectIds)
        {
            if (!Int32.TryParse(projectId, out int projectIdAsInt))
            {
                continue;
            }

            if (ctxt.Projects.AsNoTracking().Any(o => o.Id == projectIdAsInt))
            {
                continue;
            }

            string projectInfoJson = JSONGetter.GetAsJSONString(string.Format(baseUrl, projectId));
            if (string.IsNullOrEmpty(projectInfoJson))
            {
                // Nothing to parse; skip this project instead of aborting the whole run.
                Say($"No project info received for project {projectId}; skipping.");
                continue;
            }

            JObject projectObject = JObject.Parse(projectInfoJson);
            var authorToken = projectObject["author"];
            ProjectAuthor author = authorToken == null
                ? null
                : Newtonsoft.Json.JsonConvert.DeserializeObject<ProjectAuthor>(authorToken.ToString());
            if (author == null)
            {
                Say($"Project info for project {projectId} contains no author; skipping.");
                continue;
            }

            var authorId = author.id;
            if (ctxt.Authors.AsNoTracking().Any(o => o.Id == authorId)) //If the author is known...
            {
                projectCache.TryGetValue(projectId, out string fileExtension); //Validate if it exists as a file...
                if (string.IsNullOrEmpty(fileExtension))
                {
                    p.DownloadProjectToFile(projectId);
                }

                Project newProject = p.ParseProject(projectInfoJson, false);
                newProject.AuthorId = authorId;
                ctxt.Projects.Add(newProject);
                ctxt.SaveChanges();
                //TODO: Optionally immediately parse the actual project and its blocks.
            }
            else
            {
                Say($"Found project from unknown author: {authorId}");
            }

            projectCache.Remove(projectId); //This way, the cache will immediately get rid of now useless entries
        }
    }
}