Пример #1
0
        /// <summary>
        /// Pages through each Scratch explore/search listing, fetches the author of every
        /// listed project, writes each author to a file and saves the page's authors
        /// (de-duplicated by id) to the database.
        /// </summary>
        /// <param name="skip">Offset at which paging starts for every listing URL.</param>
        public void Scrape(int skip = 0)
        {
            const int pageSize  = 40;   // Matches the limit=40 used in the listing URLs below.
            const int maxOffset = 9980; // Paging stops once the next offset would exceed this.

            string[] allURLSToConsider = new string[]
            {
                "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=trending&q=*",
                "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=popular&q=*",
                "https://api.scratch.mit.edu/explore/projects?limit=40&offset={0}&mode=recent&q=*",
                // The '&' between limit and q was missing, which fused both into one parameter.
                "https://api.scratch.mit.edu/search/projects?limit=40&q=*&offset={0}",
            };

            foreach (string baseURL in allURLSToConsider)
            {
                int offset = skip;

                // do/while: the first page is always requested, even when skip is already
                // beyond maxOffset.
                do
                {
                    Console.WriteLine("Scraping at offset: " + offset.ToString());

                    // The try/catch is per page so that a single failing page does not
                    // abandon the remaining pages of this listing.
                    try
                    {
                        ScrapeAuthorsFromListingPage(string.Format(baseURL, offset.ToString()));
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine($"Exception ocurred: {ex.Message}");
                    }

                    // Always advance, also after an empty response or an exception;
                    // otherwise the same page would be requested forever.
                    offset += pageSize;
                }
                while (offset <= maxOffset);
            }
        }

        /// <summary>
        /// Downloads one listing page, fetches and stores the author of every project on it
        /// and saves the page's distinct authors to the database.
        /// </summary>
        /// <param name="pageURL">Fully formatted listing URL, including the offset.</param>
        private void ScrapeAuthorsFromListingPage(string pageURL)
        {
            string rawJson = JSONGetter.GetAsJSONString(pageURL);
            if (string.IsNullOrEmpty(rawJson))
            {
                Console.WriteLine("\t\tGetJSON2 returned null.");
                return;
            }

            dynamic projectsObject = JsonValue.Parse(rawJson);
            List <ProjectAuthor> scrapedAuthors = new List <ProjectAuthor>();
            foreach (var projectData in projectsObject)
            {
                string authorJson = GetAuthorJson(projectData["author"]["username"].ReadAs <string>());
                if (string.IsNullOrEmpty(authorJson))
                {
                    // Without a payload there is nothing to deserialize; skip this author
                    // instead of losing the whole page to an exception.
                    Console.WriteLine("\t\tNo author data returned; skipping author.");
                    continue;
                }

                ProjectAuthor projectAuthor = Newtonsoft.Json.JsonConvert.DeserializeObject <ProjectAuthor>(authorJson);
                if (projectAuthor == null)
                {
                    Console.WriteLine("\t\tAuthor data could not be deserialized; skipping author.");
                    continue;
                }

                scrapedAuthors.Add(projectAuthor);
                WriteAuthorToFile(projectAuthor.username, authorJson);
            }

            // The same author can appear several times on one page; keep one entry per id.
            SaveAuthorsToDatabase(ProjectAuthorsToDatabaseEntities(scrapedAuthors.GroupBy(x => x.id).Select(y => y.First()).ToList()));
        }
Пример #2
0
        /// <summary>
        /// Brings a stored author in line with its author JSON. The JSON is read from
        /// "{Username}.json" in <paramref name="outputDirectory"/> when that file has content;
        /// otherwise it is fetched with GetAuthorJson and written with WriteAuthorToFile.
        /// Fields that differ are copied onto <paramref name="toCheck"/>, which is then
        /// passed to UpdateAuthor.
        /// </summary>
        /// <param name="toCheck">The stored author to compare and update.</param>
        /// <param name="outputDirectory">Directory in which the author's JSON file is looked up.</param>
        private void ValidateAuthor(Author toCheck, string outputDirectory)
        {
            string authorFilePath = Path.Combine(outputDirectory, $"{toCheck.Username}.json");
            string authorJson;

            if (File.Exists(authorFilePath))
            {
                authorJson = File.ReadAllText(authorFilePath);

                // IsNullOrWhiteSpace already covers the null and empty cases.
                if (string.IsNullOrWhiteSpace(authorJson))
                {
                    authorJson = GetAuthorJson(toCheck.Username);
                    Console.WriteLine($"0-length file encountered for {toCheck.Username}");
                    WriteAuthorToFile(toCheck.Username, authorJson, true);
                }
            }
            else
            {
                authorJson = GetAuthorJson(toCheck.Username);
                WriteAuthorToFile(toCheck.Username, authorJson);
            }

            if (string.IsNullOrWhiteSpace(authorJson))
            {
                Console.WriteLine($"No author data available for {toCheck.Username}; author left unchanged.");
                return;
            }

            ProjectAuthor expectedAuthor = Newtonsoft.Json.JsonConvert.DeserializeObject <ProjectAuthor>(authorJson);

            // A payload that does not yield an author with a username (for example an error
            // response) must not overwrite the stored data with nulls.
            if (expectedAuthor == null || string.IsNullOrEmpty(expectedAuthor.username))
            {
                Console.WriteLine($"Author data for {toCheck.Username} could not be interpreted; author left unchanged.");
                return;
            }

            // Only assign when a value actually differs, as the original code did.
            if (toCheck.Username != expectedAuthor.username)
            {
                toCheck.Username = expectedAuthor.username;
            }

            // history and profile are nested objects and may be absent from the payload.
            if (expectedAuthor.history != null)
            {
                if (toCheck.DateJoined != expectedAuthor.history.joined)
                {
                    toCheck.DateJoined = expectedAuthor.history.joined;
                }
                if (toCheck.DateLastLogged != expectedAuthor.history.login)
                {
                    toCheck.DateLastLogged = expectedAuthor.history.login;
                }
            }
            if (expectedAuthor.profile != null && toCheck.Country != expectedAuthor.profile.country)
            {
                toCheck.Country = expectedAuthor.profile.country;
            }

            UpdateAuthor(toCheck);
        }
Пример #3
0
        /// <summary>
        /// Registers projects whose ids are listed in "UnregisteredProjects.txt" but which are
        /// not yet in the database. For each such id the project info is fetched; when the
        /// project's author is already in the database, the project file is downloaded if it
        /// is not present in the directory, and the parsed project is saved.
        /// </summary>
        /// <param name="pathToUnregisteredProjects">
        /// Directory containing "UnregisteredProjects.txt" and the existing project files.
        /// </param>
        /// <param name="p">Scraper used to download and parse projects.</param>
        internal static void ProcessUnregisteredProjects(string pathToUnregisteredProjects, ProjectScraper p)
        {
            string[] unregisteredProjectIds;
            Dictionary <string, string> projectCache;

            // Heartbeat timer with a 20 second interval. It is disposed by the using block,
            // so it also stops firing when one of the setup steps throws.
            using (System.Timers.Timer aTimer = new System.Timers.Timer(20000))
            {
                aTimer.Elapsed  += (source, e) => { Say("Not done yet.."); };
                aTimer.AutoReset = true;
                aTimer.Enabled   = true;

                string unregisteredProjectsFile = Path.Combine(pathToUnregisteredProjects, "UnregisteredProjects.txt");
                Say($"Enumerating unregistered project files in {unregisteredProjectsFile}.");
                unregisteredProjectIds = File.ReadAllLines(unregisteredProjectsFile).Distinct <string>().ToArray();
                Say($"Enumerating unregistered project files done");
                aTimer.Stop(); aTimer.Start(); // Restart the interval for the next step.

                Say($"Enumerating existing project files in {pathToUnregisteredProjects}. This could take a very long time...");
                string[] fileNames = Directory.GetFiles(pathToUnregisteredProjects).Select(o => Path.GetFileName(o)).ToArray();
                Say($"Enumerating existing project files done.");
                aTimer.Stop(); aTimer.Start();

                Say($"Creating projects cache. This could take a very long time...");

                // Maps the part of the file name before the first '.' to the remainder
                // (including that '.'). Built with a loop instead of ToDictionary so that
                // names without a '.' and duplicate ids do not throw.
                projectCache = new Dictionary <string, string>(fileNames.Length);
                foreach (string fileName in fileNames)
                {
                    int dotIndex = fileName.IndexOf('.');
                    if (dotIndex < 0)
                    {
                        continue; // No extension: treated the same as "no file present".
                    }
                    projectCache[fileName.Substring(0, dotIndex)] = fileName.Substring(dotIndex);
                }

                Say($"Creating projects cache done.");

                fileNames = null; //Otherwise, millions of strings will be hanging around for no reason.
            }

            // Kept from the original: reclaim the released file-name strings before the
            // database loop starts.
            GC.Collect();

            const string baseUrl = "https://api.scratch.mit.edu/projects/{0}";

            using (ApplicationDatabase ctxt = new ApplicationDatabase())
            {
                foreach (string projectId in unregisteredProjectIds)
                {
                    if (!Int32.TryParse(projectId, out int projectIdAsInt))
                    {
                        continue;
                    }
                    if (ctxt.Projects.AsNoTracking().Any(o => o.Id == projectIdAsInt))
                    {
                        continue;
                    }

                    string projectInfoJson = JSONGetter.GetAsJSONString(string.Format(baseUrl, projectId));
                    if (string.IsNullOrEmpty(projectInfoJson))
                    {
                        // Nothing to parse; skip this project rather than abort the whole run.
                        Say($"No project info returned for project {projectId}; skipping.");
                        continue;
                    }

                    JObject projectObject = JObject.Parse(projectInfoJson);
                    var     authorToken   = projectObject["author"];
                    ProjectAuthor author  = authorToken == null
                        ? null
                        : Newtonsoft.Json.JsonConvert.DeserializeObject <ProjectAuthor>(authorToken.ToString());
                    if (author == null)
                    {
                        Say($"Project info for project {projectId} contains no author; skipping.");
                        continue;
                    }

                    if (ctxt.Authors.AsNoTracking().Any(o => o.Id == author.id))       //If the author is known...
                    {
                        projectCache.TryGetValue(projectId, out string fileExtension); //Validate if it exists as a file...
                        if (string.IsNullOrEmpty(fileExtension))
                        {
                            p.DownloadProjectToFile(projectId);
                        }

                        Project newProject = p.ParseProject(projectInfoJson, false);
                        newProject.AuthorId = author.id;
                        ctxt.Projects.Add(newProject);
                        ctxt.SaveChanges();

                        //TODO: Optionally immediately parse the actual project and its blocks.
                    }
                    else
                    {
                        Say($"Found project from unknown author: {author.id}");
                    }
                    projectCache.Remove(projectId); //This way, the cache will immediately get rid of now useless entries
                }
            }
        }