Exemplo n.º 1
0
        /// <summary>
        /// Downloads the page at <c>record.InfoUrl</c> and fills in the record's
        /// Developer, Publisher and Year from the table rows found on that page.
        /// </summary>
        /// <param name="record">Record to update in place; its InfoUrl is the page that is downloaded.</param>
        /// <returns>The same <paramref name="record"/> instance that was passed in.</returns>
        public static PsxDc ScrapeIndividualInfo(PsxDc record)
        {
            // load page to string (dispose the client so its resources are released promptly)
            string html;
            using (WebClient client = new WebClient())
            {
                html = client.DownloadString(record.InfoUrl);
            }

            HtmlDocument doc = new HtmlDocument();

            doc.OptionFixNestedTags = true;
            doc.LoadHtml(html);

            // SelectNodes returns null (not an empty collection) when nothing matches
            var tables = doc.DocumentNode.SelectNodes("//table");
            if (tables == null)
            {
                return(record);
            }

            // flatten every direct <tr> child of every table, skipping tables without any
            var allRows = from table in tables.Cast<HtmlNode>()
                          let tableRows = table.SelectNodes("tr")
                          where tableRows != null
                          from row in tableRows.Cast<HtmlNode>()
                          select row;

            bool foundDev  = false;
            bool foundPub  = false;
            bool foundYear = false;

            foreach (HtmlNode row in allRows)
            {
                // stop scanning as soon as all three fields have been seen
                if (foundDev && foundPub && foundYear)
                {
                    break;
                }

                var cellNodes = row.SelectNodes("th|td");
                if (cellNodes == null)
                {
                    continue;
                }

                HtmlNode[] cells = cellNodes.ToArray();
                if (cells.Length < 2)
                {
                    // need a label cell and a value cell
                    continue;
                }

                string label = cells[0].InnerText;

                if (label.Contains("Developer"))
                {
                    record.Developer = CleanInfoCellText(cells[1].InnerText);
                    foundDev         = true;
                }
                if (label.Contains("Publisher"))
                {
                    record.Publisher = CleanInfoCellText(cells[1].InnerText);
                    foundPub         = true;
                }
                if (label.Contains("Date Released"))
                {
                    // candidate year: last space-separated token, then last tab-separated piece
                    string yearText = cells[1].InnerText.Split(' ').Last().Split('\t').Last().Trim();

                    // only store the value when it is numeric
                    int parsedYear;
                    if (int.TryParse(yearText, out parsedYear))
                    {
                        record.Year = yearText;
                    }

                    // the row counts as found even when the value was not numeric
                    foundYear = true;
                }
            }

            return(record);
        }

        /// <summary>
        /// Removes "&amp;nbsp;" entities, keeps the text after the last tab, trims whitespace
        /// and strips trailing '.' characters.
        /// </summary>
        private static string CleanInfoCellText(string cellText)
        {
            return cellText.Replace("&nbsp;", "").Split('\t').Last().Trim().TrimEnd('.');
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads the three local PSXDATACENTER HTML lists (ntscu, pal, ntscj) and builds one
        /// record per serial number found in the table rows of those files.
        /// </summary>
        /// <remarks>
        /// For each row, cell 0 supplies the info link, cell 1 the serial numbers (one per line),
        /// cell 2 the game name and cell 3 the languages. Rows without any serial number, or with
        /// fewer than four cells, are ignored. When a row has several serials, each record's name
        /// gets a " [Disc N]" suffix.
        /// </remarks>
        /// <returns>All records found, in file and row order.</returns>
        public static List<PsxDc> ScrapeInitialList()
        {
            List<PsxDc> list = new List<PsxDc>();
            HtmlDocument doc = new HtmlDocument();

            doc.OptionFixNestedTags = true;

            List<string> urls = new List<string>
            {
                AppDomain.CurrentDomain.BaseDirectory + @"..\..\Data\System\DAT\PSXDATACENTER\ntscu.html",
                AppDomain.CurrentDomain.BaseDirectory + @"..\..\Data\System\DAT\PSXDATACENTER\pal.html",
                AppDomain.CurrentDomain.BaseDirectory + @"..\..\Data\System\DAT\PSXDATACENTER\ntscj.html",
            };

            foreach (string url in urls)
            {
                // load html file to string
                string s = File.ReadAllText(url);
                doc.LoadHtml(s);

                // SelectNodes returns null (not an empty collection) when nothing matches
                var tables = doc.DocumentNode.SelectNodes("//table");
                if (tables == null)
                {
                    continue;
                }

                // flatten every direct <tr> child of every table, skipping tables without any
                var allRows = from table in tables.Cast<HtmlNode>()
                              let tableRows = table.SelectNodes("tr")
                              where tableRows != null
                              from row in tableRows.Cast<HtmlNode>()
                              select row;

                // iterate through rows
                foreach (HtmlNode row in allRows)
                {
                    var cellNodes = row.SelectNodes("th|td");
                    if (cellNodes == null)
                    {
                        continue;
                    }

                    HtmlNode[] cells = cellNodes.ToArray();
                    if (cells.Length < 2)
                    {
                        continue;
                    }

                    // Get all the serial numbers first (if no serial number present - ignore the row)
                    string[] sns = cells[1].InnerText
                                           .Split(new string[] { "\r\n" }, StringSplitOptions.None)
                                           .Select(a => a.Trim())
                                           .Where(a => a != "")
                                           .ToArray();

                    if (sns.Length == 0 || cells.Length < 4)
                    {
                        // nothing to record, or the name/languages cells are missing
                        continue;
                    }

                    // these values are the same for every serial in the row, so compute them once
                    string baseName = cells[2].InnerText.Replace("&nbsp;", "").Replace("\r\n", "").Trim().Replace("  ", " ");
                    string langs    = cells[3].InnerText.Replace("\r\n", "").Replace("&nbsp;", "").Trim();
                    string infoUrl  = ExtractInfoUrl(cells[0].InnerHtml);

                    for (int sn = 0; sn < sns.Length; sn++)
                    {
                        PsxDc record = new PsxDc();
                        record.Serial = sns[sn];

                        // game name, with a disc suffix when the row lists several serials
                        string gText = baseName;
                        if (sns.Length > 1)
                        {
                            gText += " [Disc " + (sn + 1) + "]";
                        }
                        record.Name = gText;

                        // Languages
                        record.Languages = langs;

                        // info url (left untouched when the cell has no usable link)
                        if (infoUrl != null)
                        {
                            record.InfoUrl = infoUrl;
                        }

                        // region - checks run in this order, so a later match overrides an earlier one
                        if (record.Serial.Contains("SLPS") ||
                            record.Serial.Contains("SCPS") ||
                            record.Serial.Contains("SLPM") ||
                            record.Serial.Contains("SIPS"))
                        {
                            record.Region = "JAP";
                        }

                        if (record.Serial.Contains("SLES") ||
                            record.Serial.Contains("SCES"))
                        {
                            record.Region = "EUR";
                        }

                        if (record.Serial.Contains("SLUS") ||
                            record.Serial.Contains("LSP") ||
                            record.Serial.Contains("SCUS"))
                        {
                            record.Region = "USA";
                        }

                        // add to master list
                        list.Add(record);
                    }
                }
            }

            return(list);
        }

        /// <summary>
        /// Returns the text between the first "&lt;a href=" and the last " target=" in
        /// <paramref name="cellHtml"/> with double quotes removed, or null when the cell is
        /// empty or does not contain both markers in that order.
        /// </summary>
        private static string ExtractInfoUrl(string cellHtml)
        {
            const string hrefMarker = "<a href=";

            if (string.IsNullOrEmpty(cellHtml))
            {
                return null;
            }

            int hrefIndex = cellHtml.IndexOf(hrefMarker, StringComparison.Ordinal);
            if (hrefIndex < 0)
            {
                return null;
            }

            int pFrom = hrefIndex + hrefMarker.Length;
            int pTo   = cellHtml.LastIndexOf(" target=", StringComparison.Ordinal);
            if (pTo < pFrom)
            {
                // marker missing or located before the href; Substring would throw
                return null;
            }

            return cellHtml.Substring(pFrom, pTo - pFrom).Replace("\"", "");
        }