/// <summary>
        /// Downloads the page at <c>McdiSiteParser.PREFIX + url</c> and passes the
        /// <c>href</c> of every <c>&lt;td&gt;&lt;a href="..."&gt;...&lt;/a&gt;</c> link that
        /// follows the first <c>class="even"</c> marker to
        /// <see cref="McdiCardParser"/>.<c>ParseUrl</c>.
        /// </summary>
        /// <param name="url">Page path that is appended to <c>McdiSiteParser.PREFIX</c>.</param>
        /// <remarks>
        /// Returns without processing anything when the marker is missing, and stops
        /// at the first link whose closing tokens cannot be found, instead of
        /// throwing from <c>IndexOf</c>/<c>Substring</c>.
        /// </remarks>
        public void ParseUrl(string url)
        {
            const string startMarker = "class=\"even\"";
            const string leftToken = "<td><a href=\"";
            const string splitToken = "\">";
            const string rightToken = "</a>";

            McdiCardParser cardParser = new McdiCardParser(database);

            UrlScraper scraper = new UrlScraper();
            string html = scraper.Scrape(McdiSiteParser.PREFIX + url);

            // Defensive: what Scrape returns on failure is not visible from here.
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            // Ordinal search: the tokens are markup, not linguistic text.
            int curpos = html.IndexOf(startMarker, System.StringComparison.Ordinal);
            if (curpos < 0)
            {
                // No card table on this page; a negative start index would make IndexOf throw.
                return;
            }

            // Process all the cards
            while (true)
            {
                int leftIndex = html.IndexOf(leftToken, curpos, System.StringComparison.Ordinal);
                if (leftIndex < 0)
                {
                    break;
                }

                int urlStart = leftIndex + leftToken.Length;
                int splitIndex = html.IndexOf(splitToken, urlStart, System.StringComparison.Ordinal);
                if (splitIndex < 0)
                {
                    // Truncated markup: the href is never closed.
                    break;
                }

                int rightIndex = html.IndexOf(rightToken, splitIndex + splitToken.Length, System.StringComparison.Ordinal);
                if (rightIndex < 0)
                {
                    // Truncated markup: the anchor is never closed.
                    break;
                }

                string cardUrl = html.Substring(urlStart, splitIndex - urlStart);

                // Process the card
                cardParser.ParseUrl(cardUrl);

                // Continue scanning after this anchor's text.
                curpos = rightIndex;
            }
        }
        /// <summary>
        /// Scrapes <c>sitemap.html</c>, adds an <see cref="Edition"/> to
        /// <c>database.Editions</c> for every list entry found in the first
        /// <c>&lt;h2&gt;</c> section, then runs a <see cref="McdiEditionParser"/>
        /// over every edition in <c>database.Editions</c>.
        /// </summary>
        /// <remarks>
        /// The first section ends at the next <c>&lt;h2&gt;</c>; when there is no
        /// further heading it is taken to run to the end of the document. Scanning
        /// stops at the first entry that starts outside the section or whose closing
        /// tokens are missing, instead of throwing from <c>IndexOf</c>/<c>Substring</c>.
        /// </remarks>
        public void Parse()
        {
            {
                const string sectionToken = "<h2>";
                const string leftToken = "<li><a href=\"";
                const string split1Token = "\">";
                const string split2Token = "</a> <small style=\"color: #aaa;\">";
                const string rightToken = "</small></li>";

                string url = "sitemap.html";
                UrlScraper scraper = new UrlScraper();
                string html = scraper.Scrape(McdiSiteParser.PREFIX + url);

                // Ordinal searches throughout: the tokens are markup, not linguistic text.
                // Defensive null/empty check: what Scrape returns on failure is not visible from here.
                int startpos = string.IsNullOrEmpty(html)
                    ? -1
                    : html.IndexOf(sectionToken, System.StringComparison.Ordinal);

                if (startpos >= 0)
                {
                    int endpos = html.IndexOf(sectionToken, startpos + 1, System.StringComparison.Ordinal);
                    if (endpos < 0)
                    {
                        // Only one section: it extends to the end of the page.
                        endpos = html.Length;
                    }

                    // Per the original author's note, the first section is the English listing.
                    int curpos = startpos;
                    while (true)
                    {
                        int leftIndex = html.IndexOf(leftToken, curpos, System.StringComparison.Ordinal);
                        if (leftIndex < 0 || leftIndex >= endpos)
                        {
                            // No more entries, or the next entry belongs to a later section.
                            break;
                        }

                        int urlStart = leftIndex + leftToken.Length;
                        int split1Index = html.IndexOf(split1Token, urlStart, System.StringComparison.Ordinal);
                        if (split1Index < 0)
                        {
                            break;
                        }

                        int nameStart = split1Index + split1Token.Length;
                        int split2Index = html.IndexOf(split2Token, nameStart, System.StringComparison.Ordinal);
                        if (split2Index < 0)
                        {
                            break;
                        }

                        int abbreviationStart = split2Index + split2Token.Length;
                        int rightIndex = html.IndexOf(rightToken, abbreviationStart, System.StringComparison.Ordinal);
                        if (rightIndex < 0)
                        {
                            break;
                        }

                        Edition edition = new Edition
                        {
                            SourceUrl = html.Substring(urlStart, split1Index - urlStart),
                            Name = html.Substring(nameStart, split2Index - nameStart),
                            Abbreviation = html.Substring(abbreviationStart, rightIndex - abbreviationStart)
                        };

                        // Collect
                        database.Editions.Add(edition);

                        curpos = rightIndex;
                    }
                }
            }

            // Process the sets
            foreach (Edition ed in database.Editions)
            {
                System.Console.WriteLine("====" + ed + ":" + ed.SourceUrl + "====");
                McdiEditionParser editionParser = new McdiEditionParser(database);
                editionParser.ParseUrl(ed.SourceUrl);
            }
        }
Example #3
0
 /// <summary>
 /// Creates a crawler and stores the supplied scraper in the <c>_urlScraper</c> field.
 /// </summary>
 /// <param name="urlScraper">Scraper instance to store; it is not validated here.</param>
 public Crawler(UrlScraper urlScraper)
 {
     this._urlScraper = urlScraper;
 }