Example #1
0
        /// <summary>
        /// Application entry point. Expects a mode switch followed by the path to the Zemi
        /// directory, creates that directory when it does not exist yet, and runs the scraper
        /// operation selected by the mode switch.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> is the mode switch (<c>-project</c>, <c>-author</c>, <c>-validate</c>
        /// or <c>-heuristic</c>); <c>args[1]</c> is the path to the Zemi directory, optionally
        /// wrapped in single or double quotes.
        /// </param>
        static void Main(string[] args)
        {
#if DEBUG
            Console.WriteLine("***\tZemi is running in Debug mode. This could impact application performance.");
            Console.WriteLine("***\tTo ensure optimal performance, run a Release build instead.");
#endif

            // Shared by the "too few arguments" and the "unknown mode" error paths.
            const string usage = "\n\t-project\tScraper the projects for all Authors in the connected database." +
                                 "\n\t-author\tScraper random authors from the front page." +
                                 "\n\t-validate\tValidates all Authors in the connected database." +
                                 "\n\t-heuristic\tPasses over scraped authors and scraper followers and followings." +
                                 "\n Followed by the path to the Zemi directory.";

            if (args.Length < 2)
            {
                Console.WriteLine("Incorrect number of arguments given." + usage);
                Console.ReadLine();
                return;
            }

            string type = args[0];
            // Strip quotes that may survive shell quoting of the path argument.
            string path = args[1].Trim('\"', '\'');
            if (!Directory.Exists(path))
            {
                Console.WriteLine($"Directory\"{path}\" did not exist. Creating main directory...");
                try
                {
                    Directory.CreateDirectory(path);
                }
                catch (Exception ex)
                {
                    // Without the main directory the scrapers have nowhere to write, so stop
                    // here instead of continuing with a path that does not exist.
                    Console.WriteLine(ex.Message);
                    Console.ReadLine();
                    return;
                }
            }

            // Mode switches are fixed identifiers, so compare them ordinally rather than
            // with the current culture's linguistic rules.
            if (type.StartsWith("-project", StringComparison.Ordinal))
            {
                ProjectScraper projectScraper = new ProjectScraper(Path.Combine(path, "ProjectScraper/"));
                projectScraper.Scrape();
            }
            else if (type.StartsWith("-author", StringComparison.Ordinal))
            {
                AuthorScraper authorScraper = new AuthorScraper(Path.Combine(path, "AuthorScraper/"));
                authorScraper.Scrape();
            }
            else if (type.StartsWith("-validate", StringComparison.Ordinal))
            {
                Console.WriteLine("Validating all Authors in database. This can take a very long time.");
                AuthorScraper authorScraper = new AuthorScraper(Path.Combine(path, "AuthorScraper/"));
                authorScraper.ValidateAllAuthorsInDatabase();
            }
            else if (type.StartsWith("-heuristic", StringComparison.Ordinal))
            {
                AuthorScraper authorScraper = new AuthorScraper(Path.Combine(path, "AuthorScraper/"));
                authorScraper.HeuristicScrape(0);
            }
            else
            {
                // Previously an unrecognised switch fell through silently into the key loop below.
                Console.WriteLine($"Unknown mode \"{type}\"." + usage);
                Console.ReadLine();
                return;
            }

            // Keep the console process alive after the selected operation returns.
            // NOTE(review): presumably the scrapers keep working on background threads or
            // timers after the call returns - confirm against the scraper implementations.
            while (true)
            {
                Console.ReadKey();
            }
        }
Example #2
0
        /// <summary>
        /// Registers projects listed in <c>UnregisteredProjects.txt</c> that are not in the database yet.
        /// For every listed id that parses as an integer and is unknown to the database, the project
        /// metadata is fetched from the Scratch API. When the project's author is already in the
        /// database, the project file is downloaded if no file for that id exists in the directory,
        /// and the parsed project is added and saved. Projects of unknown authors are only reported.
        /// </summary>
        /// <param name="pathToUnregisteredProjects">
        /// Directory that contains <c>UnregisteredProjects.txt</c> (one project id per line) and the
        /// already downloaded project files, named <c>{projectId}.{extension}</c>.
        /// </param>
        /// <param name="p">Scraper used to download missing project files and to parse the project metadata.</param>
        internal static void ProcessUnregisteredProjects(string pathToUnregisteredProjects, ProjectScraper p)
        {
            string unregisteredListPath = Path.Combine(pathToUnregisteredProjects, "UnregisteredProjects.txt");

            string[] unregisteredProjectIds;
            Dictionary <string, string> projectCache;

            // Progress heartbeat: reports every 20 seconds while a slow enumeration step is running.
            // The using block guarantees the timer is stopped and disposed even when a step throws.
            using (System.Timers.Timer aTimer = new System.Timers.Timer(20000))
            {
                aTimer.Elapsed  += (Object source, ElapsedEventArgs e) => { Say("Not done yet.."); };
                aTimer.AutoReset = true;
                aTimer.Enabled   = true;

                Say($"Enumerating unregistered project files in {unregisteredListPath}.");
                unregisteredProjectIds = File.ReadAllLines(unregisteredListPath).Distinct <string>().ToArray();
                Say($"Enumerating unregistered project files done");
                aTimer.Stop(); aTimer.Start(); // Restart the interval so the next step gets a full 20 seconds.

                Say($"Enumerating existing project files in {pathToUnregisteredProjects}. This could take a very long time...");
                string[] fileNames = Directory.GetFiles(pathToUnregisteredProjects).Select(o => Path.GetFileName(o)).ToArray();
                Say($"Enumerating existing project files done.");
                aTimer.Stop(); aTimer.Start();

                Say($"Creating projects cache. This could take a very long time...");
                // Maps the part of the file name before the first '.' (the project id) to the rest of
                // the name including that dot. Built with a loop instead of ToDictionary so that a file
                // without an extension or two files sharing an id cannot abort the whole run.
                projectCache = new Dictionary <string, string>(fileNames.Length);
                foreach (string fileName in fileNames)
                {
                    int dotIndex = fileName.IndexOf('.');
                    if (dotIndex <= 0)
                    {
                        continue; // No id part in front of a dot: not a project file.
                    }
                    projectCache[fileName.Substring(0, dotIndex)] = fileName.Substring(dotIndex);
                }

                Say($"Creating projects cache done.");

                fileNames = null; //Otherwise, millions of strings will be hanging around for no reason.
            }
            GC.Collect();

            const string baseUrl = "https://api.scratch.mit.edu/projects/{0}";

            using (ApplicationDatabase ctxt = new ApplicationDatabase())
            {
                foreach (string projectId in unregisteredProjectIds)
                {
                    if (!Int32.TryParse(projectId, out int projectIdAsInt))
                    {
                        continue; // Not a project id (e.g. an empty line).
                    }
                    if (ctxt.Projects.AsNoTracking().Any(o => o.Id == projectIdAsInt))
                    {
                        continue; // Already registered.
                    }

                    string projectInfoJson = JSONGetter.GetAsJSONString(string.Format(baseUrl, projectId));
                    if (string.IsNullOrEmpty(projectInfoJson))
                    {
                        // NOTE(review): assumes GetAsJSONString can return null/empty on a failed request - confirm.
                        Say($"No project info received for project: {projectId}");
                        continue;
                    }

                    JObject projectObject = JObject.Parse(projectInfoJson);
                    // The indexer yields null when the response has no "author" property
                    // (e.g. an error object instead of a project).
                    string authorJson = projectObject["author"]?.ToString();
                    ProjectAuthor author = string.IsNullOrEmpty(authorJson)
                        ? null
                        : Newtonsoft.Json.JsonConvert.DeserializeObject <ProjectAuthor>(authorJson);
                    if (author == null)
                    {
                        Say($"Project info without author for project: {projectId}");
                        continue;
                    }

                    if (ctxt.Authors.AsNoTracking().Any(o => o.Id == author.id))       //If the author is known...
                    {
                        projectCache.TryGetValue(projectId, out string fileExtension); //Validate if it exists as a file...
                        if (string.IsNullOrEmpty(fileExtension))
                        {
                            p.DownloadProjectToFile(projectId);
                        }

                        Project newProject = p.ParseProject(projectInfoJson, false);
                        newProject.AuthorId = author.id;
                        ctxt.Projects.Add(newProject);
                        ctxt.SaveChanges();

                        //TODO: Optionally immediately parse the actual project and its blocks.
                    }
                    else
                    {
                        Say($"Found project from unknown author: {author.id}");
                    }
                    projectCache.Remove(projectId); //This way, the cache will immediately get rid of now useless entries
                }
            }
        }