/// <summary>
/// Fetches each page in <paramref name="urls"/> and concatenates the HTML-decoded inner text
/// of the first <c>div</c> styled <c>font-family: monospace;</c> found on each page.
/// </summary>
/// <param name="urls">Addresses of the pages to fetch. A null sequence is treated as empty.</param>
/// <returns>
/// The extracted text of all pages, appended in input order with no separator. Pages that
/// cannot be fetched or parsed, or that contain no matching div, contribute nothing.
/// </returns>
private static string GetXrmlTextAll(IEnumerable<string> urls)
{
    const string xpath = @"//div[@style='font-family: monospace;']";
    var sb = new StringBuilder();

    if (urls == null)
    {
        return sb.ToString();
    }

    foreach (var url in urls)
    {
        try
        {
            var source = WebFetch.Fetch(url);
            if (string.IsNullOrEmpty(source))
            {
                // Nothing to parse; skip instead of relying on a swallowed exception.
                continue;
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(source);

            // SelectSingleNode yields null when the page has no matching div.
            var target = doc.DocumentNode.SelectSingleNode(xpath);
            if (target == null)
            {
                continue;
            }

            sb.Append(HttpUtility.HtmlDecode(target.InnerText));
        }
        catch (Exception ex)
        {
            // Best effort: one bad page must not abort the whole batch, but the failure
            // should at least be visible to whoever is listening to trace output.
            System.Diagnostics.Trace.TraceWarning(
                "GetXrmlTextAll: could not extract text from '{0}': {1}", url, ex.Message);
        }
    }

    return sb.ToString();
}
/// <summary>
/// Scrapes the Project Gutenberg "recent" listing and builds a candidate plain-text
/// (UTF-8) download address for every book link found there.
/// </summary>
/// <returns>
/// One address per book anchor on the listing page; empty when the page could not be
/// fetched or contains no matching anchors.
/// </returns>
private IEnumerable<string> GetRecentGutenbergBooks()
{
    // Shape of the markup being scraped:
    // <div class="pgdbrecent">
    // <h2><a name="a267"></a><a href="/browse/authors/a#a267">Appleton, Victor [pseud.]</a></h2>
    // <ul>
    //   <li class="pgdbetext"><a href="/ebooks/952">Tom Swift and His Air Glider, or Seeking the Platinum Treasure</a> (English) (as Author)</li>
    //   <li class="pgdbetext"><a href="/ebooks/4711">Tom Swift in the City of Gold, or, Marvelous Adventures Underground</a> (English) (as Author)</li>
    // </ul>
    const string url = "http://www.gutenberg.org/browse/recent/last1";
    const string xpath = "//div[@class='pgdbrecent']/ul/li/a";

    var outlinks = new List<string>();

    var source = WebFetch.Fetch(url);
    if (string.IsNullOrEmpty(source))
    {
        return outlinks;
    }

    var doc = new HtmlDocument();
    doc.LoadHtml(source);

    // SelectNodes yields null (not an empty collection) when nothing matches.
    var links = doc.DocumentNode.SelectNodes(xpath);
    if (links == null)
    {
        return outlinks;
    }

    foreach (var l in links)
    {
        var href = l.Attributes["href"];
        if (href == null || string.IsNullOrEmpty(href.Value))
        {
            // An anchor without a target cannot be turned into a download address.
            continue;
        }

        // TODO: the .txt.utf8 variant does not always exist,
        // so callers need to cope with missing files.
        outlinks.Add("http://www.gutenberg.org" + href.Value + ".txt.utf8");
    }

    return outlinks;
}
// Perhaps cache this list locally, and only regenerate it as required,
// because it's a pain to pull.
// That should be true of all of the web pulls:
// cache locally, and only rebuild as required.
// Hmm... more complicated than I currently require.

/// <summary>
/// Fetches the page at <c>LibrarySource</c> and collects the <c>href</c> of every anchor
/// matched by <c>//div[@class='mainlist']/div/ul/li/a</c>.
/// </summary>
/// <returns>
/// The link targets in document order; empty when the page could not be fetched or no
/// anchors match.
/// </returns>
private List<string> GetXrmlPageLinks()
{
    var source = WebFetch.Fetch(LibrarySource);
    if (string.IsNullOrEmpty(source))
    {
        return new List<string>();
    }

    var doc = new HtmlDocument();
    doc.LoadHtml(source);

    // SelectNodes yields null (not an empty collection) when nothing matches.
    var anchors = doc.DocumentNode.SelectNodes("//div[@class='mainlist']/div/ul/li/a");
    if (anchors == null)
    {
        return new List<string>();
    }

    return anchors
        .Where(anchor => anchor.Attributes["href"] != null) // skip anchors without a target
        .Select(anchor => anchor.Attributes["href"].Value)
        .ToList();
}
// TODO: build interfaces for getting all contents, random contents, and others;
// return them as Text sources,
// which will have to be updated to use an optional URL instead of just a file.

/// <summary>
/// Returns the text of one randomly chosen book from the recent Project Gutenberg listing.
/// </summary>
/// <returns>
/// The fetched text of the first randomly ordered candidate that yields non-empty content,
/// or an empty string when the listing is empty or no candidate yields content.
/// </returns>
private String GetGutenberg()
{
    var rnd = new Random();

    // Shuffle once and walk the result. Each candidate is tried at most once, so the
    // search always terminates even when the listing is empty or every fetch comes back blank.
    var candidates = (GetRecentGutenbergBooks() ?? Enumerable.Empty<string>())
        .OrderBy(x => rnd.Next())
        .ToList();

    foreach (var candidate in candidates)
    {
        // A given book might not actually have a .txt.utf8 version; this relies on
        // WebFetch.Fetch handing back null/empty in that case (NOTE(review): confirm),
        // in which case we move on to the next candidate.
        var source = WebFetch.Fetch(candidate);
        if (!string.IsNullOrEmpty(source))
        {
            return source; // in this case, it's string data
        }
    }

    return string.Empty;
}
/// <summary>
/// Fetches the page at <paramref name="url"/> and returns the HTML-decoded inner text of
/// the first <c>div</c> styled <c>font-family: monospace;</c>.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The extracted text, or an empty string when the page cannot be fetched or parsed, or
/// contains no matching div. This method does not propagate exceptions.
/// </returns>
public string Extract(string url)
{
    const string xpath = @"//div[@style='font-family: monospace;']";
    var text = string.Empty;

    try
    {
        var source = WebFetch.Fetch(url);
        if (string.IsNullOrEmpty(source))
        {
            return text;
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(source);

        // SelectSingleNode yields null when the page has no matching div; treat that as
        // "no text" explicitly rather than via a swallowed NullReferenceException.
        var target = doc.DocumentNode.SelectSingleNode(xpath);
        if (target != null)
        {
            text = HttpUtility.HtmlDecode(target.InnerText);
        }
    }
    catch (Exception ex)
    {
        // Callers expect an empty result rather than an exception; surface the failure
        // through trace output so it is not lost entirely.
        System.Diagnostics.Trace.TraceWarning(
            "Extract: could not extract text from '{0}': {1}", url, ex.Message);
    }

    return text;
}