/// <summary>
/// Application entry point. Expects a mode switch (<c>-project</c>, <c>-author</c>,
/// <c>-validate</c> or <c>-heuristic</c>) followed by the path to the Zemi directory,
/// then runs the matching scraper.
/// </summary>
/// <param name="args">
/// <c>args[0]</c> is the mode switch; <c>args[1]</c> is the Zemi directory path, optionally
/// wrapped in single or double quotes.
/// </param>
static void Main(string[] args)
{
#if DEBUG
    Console.WriteLine("***\tZemi is running in Debug mode. This could impact application performance.");
    Console.WriteLine("***\tTo ensure optimal performance, run a Release build instead.");
#endif

    // Shared by the "too few arguments" and the "unknown mode" messages.
    const string usage =
        "\n\t-project\tScraper the projects for all Authors in the connected database." +
        "\n\t-author\tScraper random authors from the front page." +
        "\n\t-validate\tValidates all Authors in the connected database." +
        "\n\t-heuristic\tPasses over scraped authors and scraper followers and followings." +
        "\n Followed by the path to the Zemi directory.";

    if (args.Length < 2)
    {
        Console.WriteLine("Incorrect number of arguments given." + usage);
        Console.ReadLine();
        return;
    }

    string type = args[0];
    string path = args[1].Trim('\"', '\'');

    if (!Directory.Exists(path))
    {
        Console.WriteLine($"Directory\"{path}\" did not exist. Creating main directory...");
        try
        {
            Directory.CreateDirectory(path);
        }
        catch (Exception ex)
        {
            // Best effort: the failure is reported and the selected scraper is still started.
            Console.WriteLine(ex.Message);
        }
    }

    // Modes are matched by prefix, using an ordinal (culture-independent) comparison.
    if (type.StartsWith("-project", StringComparison.Ordinal))
    {
        ProjectScraper projectScraper = new ProjectScraper(Path.Combine(path, "ProjectScraper/"));
        projectScraper.Scrape();
    }
    else if (type.StartsWith("-author", StringComparison.Ordinal))
    {
        AuthorScraper authorScraper = new AuthorScraper(Path.Combine(path, "AuthorScraper/"));
        authorScraper.Scrape();
    }
    else if (type.StartsWith("-validate", StringComparison.Ordinal))
    {
        Console.WriteLine("Validating all Authors in database. This can take a very long time.");
        AuthorScraper authorScraper = new AuthorScraper(Path.Combine(path, "AuthorScraper/"));
        authorScraper.ValidateAllAuthorsInDatabase();
    }
    else if (type.StartsWith("-heuristic", StringComparison.Ordinal))
    {
        AuthorScraper authorScraper = new AuthorScraper(Path.Combine(path, "AuthorScraper/"));
        authorScraper.HeuristicScrape(0);
    }
    else
    {
        // Previously an unknown switch did nothing and the process just sat in the key loop.
        Console.WriteLine($"Unrecognized mode \"{type}\". Expected one of:" + usage);
    }

    // Never returns: keeps the console process open after the selected mode has been started.
    // NOTE(review): presumably so the output stays visible / background work can continue - confirm.
    while (true)
    {
        Console.ReadKey();
    }
}
/// <summary>
/// Registers the projects listed in "UnregisteredProjects.txt" (read from
/// <paramref name="pathToUnregisteredProjects"/>) that are not yet present in the database.
/// </summary>
/// <remarks>
/// For every distinct, numeric project id in the list that is not already stored, the project
/// metadata is requested from the Scratch API. When the project's author is known to the
/// database, the project is downloaded (unless a file named after the id already exists in the
/// directory), parsed and saved. Projects of unknown authors are only reported via <c>Say</c>.
/// </remarks>
/// <param name="pathToUnregisteredProjects">
/// Directory containing "UnregisteredProjects.txt" and the files used to build the id cache.
/// </param>
/// <param name="p">Scraper used to download and parse the projects.</param>
internal static void ProcessUnregisteredProjects(string pathToUnregisteredProjects, ProjectScraper p)
{
    const string baseUrl = "https://api.scratch.mit.edu/projects/{0}";
    string unregisteredListPath = Path.Combine(pathToUnregisteredProjects, "UnregisteredProjects.txt");

    string[] unregisteredProjectIds;

    // Maps "text before the first dot of a file name" -> "the rest of the name, including the dot".
    Dictionary<string, string> projectCache = new Dictionary<string, string>();

    // Progress timer with a twenty second interval. The using block guarantees it is stopped and
    // disposed even when one of the enumeration steps throws.
    using (System.Timers.Timer aTimer = new System.Timers.Timer(20000))
    {
        // Hook up the Elapsed event for the timer.
        aTimer.Elapsed += (Object source, ElapsedEventArgs e) => { Say("Not done yet.."); };
        aTimer.AutoReset = true;
        aTimer.Enabled = true;

        Say($"Enumerating unregistered project files in {unregisteredListPath}.");
        unregisteredProjectIds = File.ReadAllLines(unregisteredListPath).Distinct<string>().ToArray();
        Say($"Enumerating unregistered project files done");

        // Stop/Start restarts the interval so each phase gets a full period before the first reminder.
        aTimer.Stop();
        aTimer.Start();

        Say($"Enumerating existing project files in {pathToUnregisteredProjects}. This could take a very long time...");
        string[] fileNames = Directory.GetFiles(pathToUnregisteredProjects).Select(o => Path.GetFileName(o)).ToArray();
        Say($"Enumerating existing project files done.");

        aTimer.Stop();
        aTimer.Start();

        Say($"Creating projects cache. This could take a very long time...");
        foreach (string fileName in fileNames)
        {
            int dotIndex = fileName.IndexOf('.');
            if (dotIndex < 0)
            {
                // A name without a dot cannot be split into id and extension; skip it instead of
                // letting Substring throw.
                continue;
            }

            // Indexer assignment tolerates several files sharing the same prefix (the last one
            // wins); only the presence of a key is used further down.
            projectCache[fileName.Substring(0, dotIndex)] = fileName.Substring(dotIndex);
        }
        Say($"Creating projects cache done.");

        // fileNames goes out of scope here; otherwise, millions of strings would be hanging
        // around for no reason.
    }

    // Deliberate one-off collection to reclaim the file name array before the long-running loop.
    GC.Collect();

    using (ApplicationDatabase ctxt = new ApplicationDatabase())
    {
        foreach (string projectId in unregisteredProjectIds)
        {
            if (!Int32.TryParse(projectId, out int projectIdAsInt))
            {
                continue;
            }

            // Already registered.
            if (ctxt.Projects.AsNoTracking().Any(o => o.Id == projectIdAsInt))
            {
                continue;
            }

            string projectInfoJson = JSONGetter.GetAsJSONString(string.Format(baseUrl, projectId));
            if (string.IsNullOrEmpty(projectInfoJson))
            {
                // Nothing to parse; skip this id rather than aborting the whole batch.
                Say($"No project information received for project: {projectId}");
                continue;
            }

            JObject projectObject = JObject.Parse(projectInfoJson);
            string authorJson = projectObject["author"]?.ToString();
            ProjectAuthor author = string.IsNullOrEmpty(authorJson)
                ? null
                : Newtonsoft.Json.JsonConvert.DeserializeObject<ProjectAuthor>(authorJson);
            if (author == null)
            {
                Say($"Project information without author for project: {projectId}");
                continue;
            }

            if (ctxt.Authors.AsNoTracking().Any(o => o.Id == author.id)) //If the author is known...
            {
                //Validate if it exists as a file...
                if (!projectCache.ContainsKey(projectId))
                {
                    p.DownloadProjectToFile(projectId);
                }

                Project newProject = p.ParseProject(projectInfoJson, false);
                newProject.AuthorId = author.id;
                ctxt.Projects.Add(newProject);
                ctxt.SaveChanges();
                //TODO: Optionally immediately parse the actual project and its blocks.
            }
            else
            {
                Say($"Found project from unknown author: {author.id}");
            }

            projectCache.Remove(projectId); //This way, the cache will immediately get rid of now useless entries
        }
    }
}