Example #1
0
        /// <summary>
        /// Pages through a /users/{userName}/{route} API endpoint, 40 entries at a time,
        /// and gathers every returned entry into a single JSON array.
        /// Paging stops as soon as a request yields no content or a page without entries.
        /// </summary>
        /// <param name="userName">The user name that is placed directly after /users/ in the request URL.</param>
        /// <param name="route">The route segment appended after the user name, e.g. "followers" or "following".</param>
        /// <returns>A <see cref="JsonArray"/> holding the values of all pages that were retrieved.</returns>
        private JsonArray GetAllFollowersOrFollowingsByUsername(string userName, string route)
        {
            const int pageSize    = 40;
            string    urlTemplate = "https://api.scratch.mit.edu/users/" + userName + "/" + route + "?limit=40&offset={0}";
            JsonArray collected   = new JsonArray();

            for (int pageOffset = 0; ; pageOffset += pageSize)
            {
                string pageJson = JSONGetter.GetAsJSONString(string.Format(urlTemplate, pageOffset));
                if (string.IsNullOrEmpty(pageJson))
                {
                    // Nothing came back for this page: give up and return what we have so far.
                    return collected;
                }

                var page = JsonValue.Parse(pageJson);
                if (page.Count == 0)
                {
                    // An empty page marks the end of the data.
                    return collected;
                }

                foreach (var entry in page)
                {
                    collected.Add(entry.Value);
                }
            }
        }
Example #2
0
        /// <summary>
        /// Walks the explore (trending, popular, recent) and search project endpoints page by page.
        /// For every project on a page the author's JSON is fetched via <c>GetAuthorJson</c>,
        /// handed to <c>WriteAuthorToFile</c>, and the distinct authors of the page (by id) are
        /// passed to <c>SaveAuthorsToDatabase</c>.
        /// A page that returns no content or throws is logged and skipped; scraping then continues
        /// with the next page of the same endpoint.
        /// </summary>
        /// <param name="skip">The offset at which scraping starts for each endpoint.</param>
        public void Scrape(int skip = 0)
        {
            const int pageSize   = 40;
            const int lastOffset = 9980; // Pages are requested until the offset exceeds this value.

            string[] allURLSToConsider = new string[]
            {
                "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=trending&q=*",
                "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=popular&q=*",
                "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=recent&q=*",
                // The '&' between the limit and q parameters was missing, which produced "limit=40q=*".
                "https://api.scratch.mit.edu/search/projects?limit=40&q=*&offset={0}",
            };

            foreach (string baseURL in allURLSToConsider)
            {
                int offset = skip;

                // do/while: at least one page is requested per endpoint, even for a large skip value.
                do
                {
                    Console.WriteLine("Scraping at offset: " + offset.ToString());

                    // The try/catch lives inside the paging loop so that a single failing page
                    // is skipped instead of abandoning the remainder of this endpoint.
                    try
                    {
                        string specificURL = string.Format(baseURL, offset);
                        string rawJson     = JSONGetter.GetAsJSONString(specificURL);
                        if (string.IsNullOrEmpty(rawJson))
                        {
                            // Fall through to the offset increment below; retrying the same offset
                            // unconditionally would loop forever on a persistently empty response.
                            Console.WriteLine("\t\tGetJSON2 returned null.");
                        }
                        else
                        {
                            dynamic projectsObject = JsonValue.Parse(rawJson);
                            List <ProjectAuthor> scrapedAuthors = new List <ProjectAuthor>();
                            foreach (var projectData in projectsObject)
                            {
                                string authorJson = GetAuthorJson(projectData["author"]["username"].ReadAs <string>());
                                if (string.IsNullOrEmpty(authorJson))
                                {
                                    continue; // No author data available; skip this project only.
                                }

                                ProjectAuthor projectAuthor = Newtonsoft.Json.JsonConvert.DeserializeObject <ProjectAuthor>(authorJson);
                                if (projectAuthor == null)
                                {
                                    continue;
                                }

                                scrapedAuthors.Add(projectAuthor);
                                WriteAuthorToFile(projectAuthor.username, authorJson);
                            }

                            // Several projects on one page can share an author; keep one entry per author id.
                            SaveAuthorsToDatabase(ProjectAuthorsToDatabaseEntities(scrapedAuthors.GroupBy(x => x.id).Select(y => y.First()).ToList()));
                        }
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine($"Exception ocurred: {ex.Message}");
                    }

                    offset += pageSize;
                }
                while (offset <= lastOffset);
            }
        }
Example #3
0
        /// <summary>
        /// Registers projects whose ids are listed in "UnregisteredProjects.txt" (located in
        /// <paramref name="pathToUnregisteredProjects"/>) but that are not yet present in the database.
        /// For each such id the project info is requested from the API; when the project's author is
        /// already in the database, the project file is downloaded if it is not found in the directory,
        /// and the parsed project is added to the database. Projects of unknown authors are only reported.
        /// </summary>
        /// <param name="pathToUnregisteredProjects">Directory containing "UnregisteredProjects.txt" and the existing project files.</param>
        /// <param name="p">Scraper used to download project files and to parse the project info JSON.</param>
        internal static void ProcessUnregisteredProjects(string pathToUnregisteredProjects, ProjectScraper p)
        {
            string   unregisteredProjectsFile = Path.Combine(pathToUnregisteredProjects, "UnregisteredProjects.txt");
            string[] unregisteredProjectIds;

            // Maps the part of a file name before the first '.' to the remainder (including that '.').
            Dictionary <string, string> projectCache = new Dictionary <string, string>();

            // Progress timer with a twenty second interval; disposed as soon as the slow preparation is done.
            using (System.Timers.Timer aTimer = new System.Timers.Timer(20000))
            {
                // Hook up the Elapsed event for the timer.
                aTimer.Elapsed  += (Object source, ElapsedEventArgs e) => { Say("Not done yet.."); };
                aTimer.AutoReset = true;
                aTimer.Enabled   = true;

                Say($"Enumerating unregistered project files in {unregisteredProjectsFile}.");
                unregisteredProjectIds = File.ReadAllLines(unregisteredProjectsFile).Distinct().ToArray();
                Say("Enumerating unregistered project files done");
                aTimer.Stop(); aTimer.Start(); // Restart the interval for the next phase.

                Say($"Enumerating existing project files in {pathToUnregisteredProjects}. This could take a very long time...");
                string[] fileNames = Directory.GetFiles(pathToUnregisteredProjects).Select(o => Path.GetFileName(o)).ToArray();
                Say("Enumerating existing project files done.");
                aTimer.Stop(); aTimer.Start();

                Say("Creating projects cache. This could take a very long time...");
                foreach (string fileName in fileNames)
                {
                    int dotIndex = fileName.IndexOf('.');
                    if (dotIndex < 0)
                    {
                        // A file without any '.' would make Substring(0, -1) throw; it carries no
                        // extension, so it is treated the same as a file that is not cached.
                        continue;
                    }

                    // The indexer tolerates several files sharing the same id (e.g. different
                    // extensions), where ToDictionary would throw on the duplicate key.
                    projectCache[fileName.Substring(0, dotIndex)] = fileName.Substring(dotIndex);
                }
                Say("Creating projects cache done.");

                aTimer.Enabled = false;
            }

            // The file name array is out of scope now; otherwise, millions of strings
            // would be hanging around for no reason.
            GC.Collect();

            const string baseUrl = "https://api.scratch.mit.edu/projects/{0}";

            using (ApplicationDatabase ctxt = new ApplicationDatabase())
            {
                foreach (string projectId in unregisteredProjectIds)
                {
                    if (!Int32.TryParse(projectId, out int projectIdAsInt))
                    {
                        continue;
                    }
                    if (ctxt.Projects.AsNoTracking().Any(o => o.Id == projectIdAsInt))
                    {
                        continue; // Already registered.
                    }

                    string projectInfoJson = JSONGetter.GetAsJSONString(string.Format(baseUrl, projectId));
                    if (string.IsNullOrEmpty(projectInfoJson))
                    {
                        // Without project info there is nothing to parse; skip instead of crashing the whole run.
                        Say($"No project info returned for project: {projectId}");
                        continue;
                    }

                    JObject projectObject = JObject.Parse(projectInfoJson);
                    var     authorNode    = projectObject["author"];
                    if (authorNode == null)
                    {
                        Say($"Project info without author for project: {projectId}");
                        continue;
                    }

                    ProjectAuthor author = Newtonsoft.Json.JsonConvert.DeserializeObject <ProjectAuthor>(authorNode.ToString());
                    if (author == null)
                    {
                        Say($"Project info without author for project: {projectId}");
                        continue;
                    }

                    if (ctxt.Authors.AsNoTracking().Any(o => o.Id == author.id))       //If the author is known...
                    {
                        projectCache.TryGetValue(projectId, out string fileExtension); //Validate if it exists as a file...
                        if (string.IsNullOrEmpty(fileExtension))
                        {
                            p.DownloadProjectToFile(projectId);
                        }

                        Project newProject = p.ParseProject(projectInfoJson, false);
                        newProject.AuthorId = author.id;
                        ctxt.Projects.Add(newProject);
                        ctxt.SaveChanges();

                        //TODO: Optionally immediately parse the actual project and its blocks.
                    }
                    else
                    {
                        Say($"Found project from unknown author: {author.id}");
                    }
                    projectCache.Remove(projectId); //This way, the cache will immediately get rid of now useless entries
                }
            }
        }
Example #4
0
        /// <summary>
        /// Requests the /users/{userName} API endpoint and returns whatever
        /// <c>JSONGetter.GetAsJSONString</c> yields for it.
        /// </summary>
        /// <param name="userName">The user name appended to the /users/ route.</param>
        /// <returns>The string returned by <c>JSONGetter.GetAsJSONString</c> for the user endpoint.</returns>
        public string GetAuthorJson(string userName)
        {
            string authorUrl = $"https://api.scratch.mit.edu/users/{userName}";
            return JSONGetter.GetAsJSONString(authorUrl);
        }
Example #5
0
        /// <summary>
        /// Pages through the /users/{userName}/projects API endpoint, 40 projects at a time, and
        /// converts every published project into a <see cref="Project"/>.
        /// Paging stops on an empty response or an empty page.
        /// </summary>
        /// <param name="userName">The user name placed after /users/ in the request URL.</param>
        /// <param name="ignoreRemixes">When true, projects that have a remix root or remix parent are skipped.</param>
        /// <returns>
        /// The projects collected so far. If an exception occurs its message is written to the console
        /// and the (possibly partial) list gathered up to that point is returned.
        /// </returns>
        public List <Project> GetProjectsByUsername(string userName, bool ignoreRemixes = false)
        {
            const int      pageSize          = 40;
            string         apiEndpoint       = "https://api.scratch.mit.edu/users/" + userName + "/projects?limit=40&offset={0}";
            List <Project> allProjectsOfUser = new List <Project>();

            try
            {
                for (int offset = 0; ; offset += pageSize)
                {
                    string returnedProjects = JSONGetter.GetAsJSONString(string.Format(apiEndpoint, offset));
                    if (string.IsNullOrEmpty(returnedProjects))
                    {
                        break;
                    }

                    JArray parsedProjects = JArray.Parse(returnedProjects);
                    if (parsedProjects.Count == 0)
                    {
                        break; // An empty page marks the end of the data.
                    }

                    foreach (var project in parsedProjects)
                    {
                        // The array element already is the project object; no need to serialize and re-parse it.
                        JObject projectObject = (JObject)project;

                        // "remix" may be absent or JSON null; both cases mean "not a remix".
                        JObject remixObject     = projectObject["remix"] as JObject;
                        string  remixRootText   = remixObject?["root"]?.ToString();
                        string  remixParentText = remixObject?["parent"]?.ToString();
                        bool    hasRemixRoot    = !string.IsNullOrEmpty(remixRootText);
                        bool    hasRemixParent  = !string.IsNullOrEmpty(remixParentText);

                        // A project counts as a remix when either reference is present, which matches
                        // the conditions under which ignoreRemixes skips it.
                        bool isRemixed = hasRemixRoot || hasRemixParent;
                        if (isRemixed && ignoreRemixes)
                        {
                            continue;
                        }

                        // Parse the token text itself (previously the still-zero local was parsed,
                        // so the remix root was always stored as 0).
                        int remixRoot   = hasRemixRoot ? Int32.Parse(remixRootText) : 0;
                        int remixParent = hasRemixParent ? Int32.Parse(remixParentText) : 0;

                        if (projectObject["is_published"].Value <bool>() == false) //Check if the project is published (not private)
                        {
                            Console.WriteLine($"P: {projectObject["id"]}");
                            continue;
                        }

                        ProjectStats   projectStats   = Newtonsoft.Json.JsonConvert.DeserializeObject <ProjectStats>(projectObject["stats"].ToString());
                        ProjectHistory projectHistory = Newtonsoft.Json.JsonConvert.DeserializeObject <ProjectHistory>(projectObject["history"].ToString());
                        Project        toAdd          = new Project
                        {
                            Id             = Int32.Parse(projectObject["id"].ToString()),
                            ProjectName    = projectObject["title"].ToString(),
                            AuthorId       = 0,
                            Author         = null,
                            Created        = projectHistory.created,
                            Modified       = projectHistory.modified,
                            TotalViews     = projectStats.views,
                            TotalFavorites = projectStats.favorites,
                            TotalLoves     = projectStats.loves,
                            // Use the shared date when there is one; fall back to MinValue otherwise.
                            // (The check used to be inverted, casting null and discarding real dates.)
                            Shared         = projectHistory.shared != null ? (DateTime)projectHistory.shared : DateTime.MinValue,
                            IsRemix        = isRemixed,
                            RemixParent    = remixParent,
                            RemixRoot      = remixRoot
                        };
                        allProjectsOfUser.Add(toAdd);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the problem and hand back whatever was collected so far.
                Console.WriteLine(ex.Message);
            }
            return allProjectsOfUser;
        }