/// <summary>
/// Collects the distinct link targets of every <c>&lt;a href="..."&gt;</c> anchor found in the
/// raw content of <paramref name="webSite"/>.
/// </summary>
/// <param name="webSite">
/// Site whose <c>RawContent</c> is scanned. Its <c>Uri</c> is used as the base when a link
/// target starts with <c>/</c>.
/// </param>
/// <returns>
/// The distinct link strings in order of first appearance, or <c>null</c> when the content
/// contains no matching anchor at all (callers must check for <c>null</c>).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="webSite"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">The site's raw content is null, empty or whitespace.</exception>
public List<string> GetLinksFromSite(WebSite webSite)
{
    if (webSite == null)
    {
        throw new ArgumentNullException("webSite");
    }

    string rawHtml = webSite.RawContent;
    if (string.IsNullOrWhiteSpace(rawHtml))
    {
        throw new ArgumentException("WebSite object had no raw content");
    }

    // Each match is this literal prefix followed by at least one character of the target,
    // so stripping the prefix always leaves a non-empty href.
    const string anchorPrefix = "<a href=\"";
    Regex rgx = new Regex(anchorPrefix + "[^\\\"]+");
    MatchCollection matches = rgx.Matches(rawHtml);
    if (matches.Count == 0)
    {
        // Kept for backward compatibility: "no anchors" is reported as null, not an empty list.
        return null;
    }

    Uri baseUri = webSite.Uri;
    List<string> urls = new List<string>();
    // Tracks what has already been added so de-duplication is O(1) per link
    // instead of a linear List.Contains scan.
    HashSet<string> seen = new HashSet<string>();

    foreach (Match match in matches)
    {
        string href = match.Value.Substring(anchorPrefix.Length);
        string url;

        if (href[0] == '/')
        {
            // Targets such as href="/artiklar/..." (seen on di.se) are relative to the site.
            // Resolve them against the base URI rather than concatenating strings, so that a
            // base with a path and protocol-relative "//host/..." targets are handled correctly.
            Uri resolved;
            if (baseUri == null || !Uri.TryCreate(baseUri, href, out resolved))
            {
                // No usable base or an unresolvable target: skip this link.
                continue;
            }

            url = resolved.AbsoluteUri;
        }
        else
        {
            url = href;
        }

        if (seen.Add(url))
        {
            urls.Add(url);
        }
    }

    return urls;
}
/// <summary>
/// Counts the case-insensitive occurrences of <paramref name="word"/> in the tag-stripped
/// content of <paramref name="webSite"/>.
/// </summary>
/// <remarks>
/// As a side effect the raw content is stored in <c>_RawHTML</c> and the stripped content in
/// <c>_StrippedHTML</c>.
/// </remarks>
/// <param name="webSite">Site whose <c>RawContent</c> is searched.</param>
/// <param name="word">
/// Text to look for. It is inserted into the regular expression as-is, so regex
/// metacharacters in it are interpreted as pattern syntax.
/// </param>
/// <returns>
/// A result carrying the occurrence count; <c>Matches</c> is only assigned when at least one
/// occurrence was found.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="webSite"/> or <paramref name="word"/> is <c>null</c>.
/// </exception>
/// <exception cref="ArgumentException">The site's raw content is null, empty or whitespace.</exception>
public SiteSearchResult SearchSiteForWord(WebSite webSite, string word)
{
    if (webSite == null)
    {
        throw new ArgumentNullException("webSite");
    }

    // Previously this guard re-tested webSite, so a null word slipped through and
    // degenerated into the pattern "[^\b]?", which matches at every position.
    if (word == null)
    {
        throw new ArgumentNullException("word");
    }

    string rawHtml = webSite.RawContent;
    if (string.IsNullOrWhiteSpace(rawHtml))
    {
        throw new ArgumentException("WebSite object had no raw content");
    }

    _RawHTML = rawHtml;
    rawHtml = stripHtmlTags(rawHtml);
    _StrippedHTML = rawHtml;

    // NOTE(review): inside a character class "\b" is a backspace, so "[^\b]?" optionally
    // consumes one extra character after the word; each match value may therefore be one
    // character longer than the word. The original author also flagged this pattern for
    // review - confirm the intent before changing it, since callers read Matches.
    Regex rgx = new Regex(word + "[^\\b]?", RegexOptions.IgnoreCase);
    MatchCollection matches = rgx.Matches(rawHtml);

    if (matches.Count > 0)
    {
        return new SiteSearchResult(webSite, word) { Occurences = matches.Count, Matches = matches };
    }

    return new SiteSearchResult(webSite, word) { Occurences = 0 };
}
/// <summary>
/// Manual console check of the web library against www.di.se: downloads the page, prints the
/// number of occurrences of "Aktier", writes the searcher's output to file and dumps the
/// downloaded content line by line.
/// </summary>
/// <remarks>
/// NOTE(review): the successful path returns right after dumping the content, so the
/// link-extraction section at the bottom only runs when the first part threw. That mirrors the
/// existing control flow; confirm whether the early return is a leftover debugging shortcut.
/// </remarks>
private static void testWebLibrary()
{
    SiteSearcher searcher = new SiteSearcher();
    WebSite ws = new WebSite("www.di.se");

    try
    {
        Console.WriteLine(ws.Uri.AbsoluteUri);
        ws.DownloadRawContent();

        SiteSearchResult result = searcher.SearchSiteForWord(ws, "Aktier");
        Console.WriteLine(result.Occurences);
        searcher.WriteToFile();

        // Stop at end of stream instead of printing the terminating null as an extra line.
        StreamReader reader = ws.RawContentStreamReader;
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            Console.WriteLine(line);
        }

        return;
    }
    catch (Exception e)
    {
        Console.WriteLine("ERROR IN TEST:\n" + e.Message);
    }

    List<Uri> uris = new List<Uri>();
    List<string> links = searcher.GetLinksFromSite(ws);

    // GetLinksFromSite returns null when the page has no anchors; treat that as "no links".
    if (links != null)
    {
        foreach (string url in links)
        {
            // Uri.TryCreate reports failure through its return value, so no exception
            // handling is needed here. Only links below the site's own URI are kept.
            Uri candidate;
            if (Uri.TryCreate(url, UriKind.Absolute, out candidate) && ws.Uri.IsBaseOf(candidate))
            {
                uris.Add(candidate);
                Console.WriteLine(candidate.AbsoluteUri);
            }
        }
    }

    Console.WriteLine(uris.Count);
}
/// <summary>
/// Creates a search result for the given site and search word.
/// </summary>
/// <param name="siteSearched">The site that was searched; stored in <c>WebSiteSearched</c>.</param>
/// <param name="forWord">The word that was searched for; stored in <c>Word</c>.</param>
public SiteSearchResult(WebSite siteSearched, string forWord)
{
    WebSiteSearched = siteSearched;
    Word = forWord;
}