Example #1
0
        /// <summary>
        /// Scrapes the KapitalSin forum board in two passes. The surface pass walks the paged
        /// board listing and collects every distinct link that contains the site's host name.
        /// The deep pass downloads each collected link, keeps the pages that contain all of the
        /// marker strings, extracts a title and a size from them, and finally writes all
        /// entries as JSON to <c>Data/{host}.json</c>.
        /// </summary>
        /// <remarks>
        /// Pages whose title node cannot be found, and links that are not absolute URIs, are
        /// skipped instead of aborting the whole run.
        /// </remarks>
        public static void KapitalSin()
        {
            string ogLink  = "http://www.kapitalsin.com/forum/index.php?board=24.%page%"; //arg
            string baseUrl = new Uri(ogLink).Host;

            // A page is only treated as an entry when it contains every one of these strings.
            // NOTE(review): presumably fragments of "Género" / "Tamaño" without the first
            // letter so that capitalisation does not matter - confirm.
            string[] pageValidIfContains = new string[]
            {
                "énero",
                "amaño",
            };     //arg

            // The list keeps discovery order; the set only makes the duplicate check O(1).
            List<string>    allLinks  = new List<string>();
            HashSet<string> seenLinks = new HashSet<string>();

            // Number of consecutive listing pages that produced no new link.
            int emptyPages = 0;

            //Surface scrape
            for (int i = 0; i < MaxPages; i = i + 10)
            {
                string listingUrl = ogLink.Replace("%page%", $"{i}");
                string home       = GetWebString(listingUrl);

                // Remove the PHPSESSID fragment from the markup before extracting links.
                string sessionId = GetBetween(home, "PHPSESSID=", "&amp;");
                home = home.Replace($"PHPSESSID={sessionId}&amp;", "");

                var hrefs = Regex.Matches(home, "<a href=\"(.*?)\"", RegexOptions.Singleline)
                                 .Cast<Match>()
                                 .Select(m => m.Groups[1].Value);

                bool added = false;
                foreach (string href in hrefs)
                {
                    // Cut off everything from the first '#' so fragment variants collapse into one link.
                    int    hashIndex = href.IndexOf('#');
                    string link      = hashIndex >= 0 ? href.Remove(hashIndex) : href;

                    // HashSet.Add returns false for a link that was already collected.
                    if (link.Contains(baseUrl) && seenLinks.Add(link))
                    {
                        allLinks.Add(link);
                        Console.WriteLine(link);
                        added = true;
                    }
                }

                // Stop once three listing pages in a row yielded nothing new.
                emptyPages = added ? 0 : emptyPages + 1;
                if (emptyPages >= 3)
                {
                    break;
                }
            }

            string      titlePath = "//div[@class=\"keyinfo\"]/h5"; //arg
            List<Entry> entries   = new List<Entry>();

            //Deep scrape
            foreach (string link in allLinks)
            {
                // A collected href is not guaranteed to be an absolute URI; skip it rather than
                // letting the Uri constructor throw. Links to the site root are never entries,
                // so they are skipped before spending a download on them.
                Uri linkUri;
                if (!Uri.TryCreate(link, UriKind.Absolute, out linkUri) || linkUri.AbsolutePath == "/")
                {
                    continue;
                }

                string page = GetWebString(link);
                if (!pageValidIfContains.All(page.Contains))
                {
                    continue;
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(page);

                // SelectSingleNode returns null when nothing matches the XPath.
                var titleNode = doc.DocumentNode.SelectSingleNode(titlePath);
                if (titleNode == null)
                {
                    Console.WriteLine($"Skipping {link}: no node matches {titlePath}");
                    continue;
                }

                // Read the clock once so all parts of the timestamp belong to the same instant.
                DateTime now = DateTime.UtcNow;

                var e = new Entry();
                e.Title     = CleanText(System.Net.WebUtility.HtmlDecode(titleNode.InnerText));
                e.IndexDate = $"{now.Year}-{now.Month}-{now.Day} {now.Hour}:{now.Minute}";
                e.Repacker  = "KapitalSin";
                // Invariant upper-casing keeps the extracted size independent of the machine's
                // culture. GetBetween is defined elsewhere, so its result is upper-cased again
                // exactly as before.
                e.Size      = GetBetween(page.ToUpperInvariant(), "AMAÑO", "B")
                                  .ToUpperInvariant()
                                  .Replace(",", ".")
                                  .Replace("</STRONG>: ", "")
                                  .Replace(":</STRONG> ", "") + "B";
                e.Link      = link;
                entries.Add(e);
                Console.WriteLine(JsonConvert.SerializeObject(e));
            }

            string json = JsonConvert.SerializeObject(entries);

            // WriteAllText does not create missing directories.
            Directory.CreateDirectory("Data");
            File.WriteAllText($"Data/{baseUrl}.json", json);
        }
            /// <summary>
            /// Scores <paramref name="node"/> by the distance between the index at which
            /// <paramref name="match"/> reports it and the node's own position.
            /// </summary>
            /// <param name="match">The match that is asked for the index of <paramref name="node"/>.</param>
            /// <param name="node">The node to look up; its <c>Position</c> is compared with the reported index.</param>
            /// <returns>
            /// -1 when <c>match.IndexOf(node)</c> is negative; otherwise the absolute value of
            /// <c>node.Position + 1 - index</c>.
            /// </returns>
            protected override int GetScore(Match match, Node node)
            {
                var foundAt = match.IndexOf(node);

                // A negative index means no score can be calculated.
                if (foundAt < 0)
                {
                    return -1;
                }

                var offset = node.Position + 1 - foundAt;
                return Math.Abs(offset);
            }