Esempio n. 1
0
        /// <summary>
        /// Collects the distinct targets of every <c>&lt;a href="..."&gt;</c> occurrence in the raw
        /// HTML of <paramref name="webSite"/>.
        /// </summary>
        /// <param name="webSite">Site whose <c>RawContent</c> is scanned; must not be null.</param>
        /// <returns>
        /// The distinct href values in order of first appearance. Targets that start with "/" are
        /// prefixed with the scheme and host of <c>webSite.Uri</c>. The list is empty (never null)
        /// when the content contains no matching anchor.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="webSite"/> is null.</exception>
        /// <exception cref="ArgumentException">The site's raw content is null, empty or whitespace.</exception>
        public List<string> GetLinksFromSite(WebSite webSite)
        {
            if (webSite == null) throw new ArgumentNullException("webSite");

            string rawHtml = webSite.RawContent;
            if (string.IsNullOrWhiteSpace(rawHtml))
                throw new ArgumentException("WebSite object had no raw content");

            // Length of the literal prefix `<a href="` that every match starts with.
            const int hrefPrefixLength = 9;

            Regex rgx = new Regex("<a href=\"[^\\\"]+");
            MatchCollection matches = rgx.Matches(rawHtml);

            List<string> urls = new List<string>();
            // Tracks what has already been added so de-duplication does not rescan the list.
            HashSet<string> seen = new HashSet<string>();

            foreach (Match match in matches)
            {
                string url;
                try
                {
                    string href = match.Value.Substring(hrefPrefixLength);

                    // A target such as href="/artiklar/..." is relative to the site root (this
                    // occurs on di.se), so prepend scheme + host rather than the full page
                    // address, which would be wrong whenever the page itself has a path.
                    // NOTE(review): assumes WebSite.Uri is a System.Uri - confirm.
                    if (href.StartsWith("/", StringComparison.Ordinal))
                        url = webSite.Uri.GetLeftPart(UriPartial.Authority) + href;
                    else
                        url = href;
                }
                catch (Exception e)
                {
                    // Best effort: report the link that could not be resolved and keep going.
                    Console.WriteLine(e.Message);
                    continue;
                }

                if (seen.Add(url))
                {
                    urls.Add(url);
                }
            }
            return urls;
        }
Esempio n. 2
0
        /// <summary>
        /// Counts the case-insensitive occurrences of <paramref name="word"/> in the tag-stripped
        /// content of <paramref name="webSite"/>.
        /// </summary>
        /// <remarks>
        /// Stores the raw content in <c>_RawHTML</c> and the stripped content in
        /// <c>_StrippedHTML</c> as a side effect.
        /// </remarks>
        /// <param name="webSite">Site whose <c>RawContent</c> is searched; must not be null.</param>
        /// <param name="word">Literal text to look for; must not be null.</param>
        /// <returns>
        /// A result carrying the number of matches. <c>Matches</c> is only assigned when at least
        /// one match was found.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="webSite"/> or <paramref name="word"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">The site's raw content is null, empty or whitespace.</exception>
        public SiteSearchResult SearchSiteForWord(WebSite webSite, string word)
        {
            if (webSite == null) throw new ArgumentNullException("webSite");
            if (word == null) throw new ArgumentNullException("word");

            string rawHtml = webSite.RawContent;
            if (string.IsNullOrWhiteSpace(rawHtml))
                throw new ArgumentException("WebSite object had no raw content");
            _RawHTML = rawHtml;

            rawHtml = stripHtmlTags(rawHtml);
            _StrippedHTML = rawHtml;

            // Escape the word so regex metacharacters in it ("C++", "di.se") are matched literally
            // instead of throwing or over-matching.
            // NOTE(review): inside a character class \b means backspace, so the "[^\b]?" suffix
            // optionally consumes one more (non-backspace) character after the word; each match
            // value may therefore carry one trailing character. Kept as-is - confirm the intent.
            Regex rgx = new Regex(Regex.Escape(word) + "[^\\b]?", RegexOptions.IgnoreCase);

            MatchCollection matches = rgx.Matches(rawHtml);
            if (matches.Count > 0)
            {
                return new SiteSearchResult(webSite, word) { Occurences = matches.Count, Matches = matches };
            }
            return new SiteSearchResult(webSite, word) { Occurences = 0 };
        }
Esempio n. 3
0
        /// <summary>
        /// Manual smoke test of the web library against www.di.se: downloads the page, searches it
        /// for "Aktier", dumps the raw content and the matches to the console, then prints every
        /// extracted link that lies under the site's own address, followed by how many there were.
        /// </summary>
        private static void testWebLibrary()
        {
            SiteSearcher searcher = new SiteSearcher();
            WebSite ws = new WebSite("www.di.se");
            try
            {
                Console.WriteLine(ws.Uri.AbsoluteUri);
                ws.DownloadRawContent();

                SiteSearchResult result = searcher.SearchSiteForWord(ws, "Aktier");
                Console.WriteLine(result.Occurences);
                searcher.WriteToFile();

                // Echo the downloaded content line by line; ReadLine returns null at the end.
                StreamReader reader1 = ws.RawContentStreamReader;
                string line;
                while ((line = reader1.ReadLine()) != null)
                {
                    Console.WriteLine(line);
                }

                // Matches is left unassigned when the search found nothing.
                if (result.Matches != null)
                {
                    foreach (Match match in result.Matches)
                    {
                        Console.WriteLine(match.Value);
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("ERROR IN TEST:\n" + e.Message);
                // Without downloaded content there is nothing to extract links from.
                return;
            }

            List<Uri> uris = new List<Uri>();
            List<string> c = searcher.GetLinksFromSite(ws);
            if (c != null)
            {
                foreach (string url in c)
                {
                    // TryCreate reports failure through its return value, so no catch is needed.
                    Uri u;
                    if (!Uri.TryCreate(url, UriKind.Absolute, out u))
                        continue;

                    // Keep only links that stay below the address of the site under test.
                    if (ws.Uri.IsBaseOf(u))
                    {
                        uris.Add(u);
                        Console.WriteLine(u.AbsoluteUri);
                    }
                }
            }
            Console.WriteLine(uris.Count);
        }
Esempio n. 4
0
 /// <summary>
 /// Creates a search result for the given site and search word.
 /// </summary>
 /// <param name="siteSearched">Site that was searched; stored in <c>WebSiteSearched</c>.</param>
 /// <param name="forWord">Word that was searched for; stored in <c>Word</c>.</param>
 public SiteSearchResult(WebSite siteSearched, string forWord)
 {
     WebSiteSearched = siteSearched;
     Word = forWord;
 }