/// <summary>
/// Downloads the RSS document for a single WOEID and flattens the fields of
/// interest into a string array.
/// </summary>
/// <param name="state">State name; copied verbatim into slot 2 of the result.</param>
/// <param name="woeid">
/// WOEID appended to the feed URL as the <c>w</c> query parameter; also stored in slot 0.
/// </param>
/// <returns>
/// An array whose length is the configured <c>TOTALPARAMS</c> value, laid out as:
/// [0] woeid, [1] configured COUNTRY, [2] state, [3] city, [4] temperature,
/// [5] timestamp taken from the local clock at parse time, [6] configured TEMPTYPE,
/// [7] item title, [8] item pubDate, [9] condition text, [10] condition temperature.
/// When fetching succeeded but parsing or filling fails, the failure is reported on
/// the console and the array is still returned; slots that were not written stay null.
/// </returns>
public String[] getMapFromRSSXml(string state, string woeid)
{
    String[] returnMap = new String[Int32.Parse(config.Configuration["TOTALPARAMS"])];

    // Read once; used both for the request URL and for slot 6.
    string tempType = config.Configuration["TEMPTYPE"];

    currUrl = rssURL + "?w=" + woeid + "&u=" + tempType;
    string xmlForCurrWoeid = yScraper.getDataFromURL(currUrl);

    const string cityXPath = "/rss/channel/";
    const string allXPath = "/rss/channel/item/";

    try
    {
        // The channel title is split on '-' and then ',' to isolate the city name.
        string city = parseXMLGetValue(xmlForCurrWoeid, cityXPath, "title").Split('-')[1].Split(',')[0].Trim();

        string temp = parseXMLGetValueWithAttr(xmlForCurrWoeid, allXPath, "yweather:condition", "temp");
        string ctext = parseXMLGetValueWithAttr(xmlForCurrWoeid, allXPath, "yweather:condition", "text");

        // Slot 10 carries the same element/attribute as slot 4, so the value is
        // reused instead of parsing the document a second time.
        string ctemp = temp;

        string lu = DateTime.Now.ToString(@"M/d/yyyy hh:mm:ss tt");
        string title = parseXMLGetValue(xmlForCurrWoeid, allXPath, "title");
        string pd = parseXMLGetValue(xmlForCurrWoeid, allXPath, "pubDate");

        // Slots are written only after every value was extracted successfully.
        returnMap[0] = woeid;
        returnMap[1] = config.Configuration["COUNTRY"];
        returnMap[2] = state;
        returnMap[3] = city;
        returnMap[4] = temp;
        returnMap[5] = lu;
        returnMap[6] = tempType;
        returnMap[7] = title;
        returnMap[8] = pd;
        returnMap[9] = ctext;
        returnMap[10] = ctemp;
    }
    catch (Exception e)
    {
        // Include the cause so a parse failure can be told apart from a
        // configuration problem (e.g. TOTALPARAMS smaller than 11).
        Console.WriteLine("Unable to fetch for : " + currUrl + " :: WOEID = " + woeid + " :: " + e.Message + "\n");
    }

    return returnMap;
}
/// <summary>
/// Loads the cities of every state by scraping: fetches the page at <c>mainURL</c>,
/// extracts the state names from it, fetches one page per state and stores the
/// city path segments found in that page's links in <c>statesToCitiesMap</c>
/// as a comma-separated string keyed by the state text.
/// </summary>
/// <returns>
/// <c>true</c> when the whole scrape ran to completion; <c>false</c> when any
/// exception was raised (the exception is written to the console).
/// </returns>
public bool loadCities()
{
    /*
     * Here we perform the scraping operation and then parse the HTML data using regex.
     * Note: an alternative for parsing would be the HTML Agility Pack [open source].
     */
    try
    {
        Console.WriteLine("Loading all the States and Cities...");
        string statesString = yScraper.getDataFromURL(mainURL);

        // Use a regex to get the markup between the HTML tags that hold all the states.
        Regex statesBlockRegex = new Regex(
            @"<div class=""yom-mod(.+?)div id=""reg-pg"">",
            RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ECMAScript | RegexOptions.CultureInvariant);
        string excerpt = statesBlockRegex.Match(statesString).Groups[1].Value;

        // Use a regex again to pull out the text between tags: the states,
        // along with 2 or 3 extra sentences.
        Regex textNodeRegex = new Regex(
            @"(?<=^|>)[^><]+?(?=<|$)",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
        MatchCollection matches = textNodeRegex.Matches(excerpt);

        // The pattern isolating the city list is constant, so build it once
        // instead of once per state.
        Regex citiesBlockRegex = new Regex(
            @"<div class=""yom-mod yom-weather-region""(.+?)<div id=""reg-pg"">",
            RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ECMAScript | RegexOptions.CultureInvariant);

        // Now scrape once more per state to gather its cities; the procedure is similar.
        int tempCount = 0;       // 1-based position inside the match collection
        double percentDone = 0;  // number of states processed so far

        foreach (Match m in matches)
        {
            tempCount++;

            // The first startCountForStates matches are skipped so that the
            // match collection is consumed from the right position.
            if (tempCount <= startCountForStates)
            {
                continue;
            }

            string stateName = m.ToString();

            percentDone += 1;
            Console.Write("\r" + ((int)((percentDone / (double)(matches.Count - startCountForStates)) * 100)) + "% done");

            string citiesString = yScraper.getDataFromURL(mainURL + "" + stateName);
            string excerptCities = citiesBlockRegex.Match(citiesString).Groups[1].Value;

            // Country and state are literal text, so they are escaped before being
            // spliced into the pattern; ToLowerInvariant keeps the path segment
            // independent of the machine's culture.
            string linkPattern =
                @"<a href=""/" + Regex.Escape(config.Configuration["COUNTRY"].ToLowerInvariant())
                + "/" + Regex.Escape(stateName.Replace(" ", "-"))
                + @"/(.+?)/"">";
            Regex cityLinkRegex = new Regex(linkPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);
            MatchCollection matchesCities = cityLinkRegex.Matches(excerptCities);

            if (matchesCities.Count == 0)
            {
                // Previously this case called Substring with a negative length,
                // which threw and aborted the whole load. Skip just this state.
                Console.WriteLine("\nNo cities found for : " + stateName);
                continue;
            }

            string[] cities = new string[matchesCities.Count];
            for (int k = 0; k < matchesCities.Count; k++)
            {
                // The city is the text between the last two '/' of the matched link.
                string link = matchesCities[k].Value;
                int lastSlash = link.LastIndexOf('/');
                int prevSlash = link.Substring(0, lastSlash - 1).LastIndexOf('/');
                cities[k] = link.Substring(prevSlash + 1, (lastSlash - prevSlash) - 1);
            }

            // Add cities and states to the dictionary for future use.
            statesToCitiesMap.Add(stateName, string.Join(",", cities));
        }

        return true;
    }
    catch (Exception e)
    {
        Console.WriteLine("Error occured : \n" + e);
    }

    return false;
}