// JOURNAL SEARCH RESULTS
/// <summary>
/// Runs a Google Scholar advanced search restricted to one publication
/// (the <c>as_publication</c> field, <c>num=20</c>) and scrapes the returned
/// page into an <c>SG.Journal</c>.
/// </summary>
/// <param name="journalName">
/// Journal title to search for. It is trimmed, runs of whitespace become a
/// single '+', and every other reserved character is percent-encoded before it
/// is placed in the query string. <c>null</c> is treated as an empty title.
/// </param>
/// <param name="ISSN">
/// Accepted for interface compatibility; it is not part of the request URL.
/// </param>
/// <param name="keywords">
/// Accepted for interface compatibility; it is not part of the request URL.
/// </param>
/// <param name="next_url">
/// After the result list has been parsed: the absolute URL of the next result
/// page, or <c>null</c> when the page has no such link. Left untouched when the
/// method returns before parsing (load failure, captcha, no results).
/// </param>
/// <returns>
/// The journal with one paper per parsed hit; the journal with no added papers
/// when the page contains no hits; <c>null</c> when the page could not be
/// loaded or <c>checkForCaptcha()</c> reports a captcha.
/// </returns>
public SG.Journal getJournals(string journalName, string ISSN, string keywords, ref string next_url)
{
    // CONNECTIONS
    // A null title used to crash in Regex.Replace further down; normalise it instead.
    journalName = journalName == null ? "" : journalName.Trim();
    SG.Journal result = new SG.Journal(journalName);

    // FORMATTING STRINGS
    // NOTE(review): ISSN and keywords were normalised here but never reached the
    // URL (as_q is sent empty). That behaviour is kept; confirm whether they
    // should be added to the query.
    string name = Regex.Replace(journalName, @"\s+", " ");
    // Escape first so '&', '#', '+' etc. inside a title cannot break the query
    // string, then use '+' as the word separator Scholar expects.
    name = Uri.EscapeDataString(name).Replace("%20", "+");

    string url = "http://scholar.google.com/scholar?as_q=&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=&as_publication="
               + name
               + "&as_ylo=&as_yhi=&btnG=&hl=en&as_sdt=0,5&num=20";
    Console.WriteLine(url);

    HtmlWeb web = new HtmlWeb();
    try
    {
        doc = web.Load(url);
    }
    catch (Exception)
    {
        // Any load failure is reported to the caller as null.
        return null;
    }

    string xpath = "//div[@class=\"gs_ri\"]";
    HtmlNodeCollection searchResults = doc.DocumentNode.SelectNodes(xpath);
    if (searchResults == null)
    {
        if (checkForCaptcha())
        {
            Console.WriteLine("Captcha problem ...");
            return null;
        }
        Console.WriteLine("No results ...");
        return result;
    }

    int rank = 1;
    foreach (HtmlNode n in searchResults)
    {
        // TITLE AND TITLE LINK
        HtmlNode titleNode = n.SelectSingleNode(".//*[@class=\"gs_rt\"]");
        if (titleNode == null)
        {
            // A hit without a title block cannot be turned into a paper; skip it
            // instead of failing the whole page with a NullReferenceException.
            continue;
        }
        string title = titleNode.InnerText;
        string titleLink = "";
        HtmlNode url_node = titleNode.SelectSingleNode(".//a");
        if (url_node != null)
        {
            // Strip the "amp;" left over from "&amp;" in the raw attribute value.
            titleLink = url_node.GetAttributeValue("href", "").Replace("amp;", "");
        }

        // AUTHORS AND PUBLICATION
        // The byline is split on '-': authors, then "publication, year" (or just
        // a year), then optionally the publisher.
        string authors = "", publication = "", publisher = "";
        int year = -1;
        HtmlNode byline = n.SelectSingleNode(".//*[@class=\"gs_a\"]");
        if (byline != null)
        {
            string[] names = byline.InnerText.Split('-');
            authors = names[0];
            if (names.Length > 2)
            {
                publisher = names[2];
            }
            if (names.Length > 1)
            {
                string middle = names[1].Trim();
                int parsedYear;
                if (int.TryParse(middle, out parsedYear))
                {
                    year = parsedYear;
                }
                else
                {
                    string[] p = middle.Split(',');
                    if (p.Length > 1 && int.TryParse(p[1], out parsedYear))
                    {
                        year = parsedYear;
                    }
                    publication = p[0];
                }
            }
        }

        // SUMMARY
        HtmlNode summaryNode = n.SelectSingleNode(".//*[@class=\"gs_rs\"]");
        string summary = summaryNode != null ? summaryNode.InnerText : "";

        // CITATION STUFF
        int no_of_citations = 0;
        string cited_by_url = "";
        HtmlNode footer = n.SelectSingleNode(".//*[@class=\"gs_fl\"]");
        HtmlNode citedBy = footer != null ? footer.FirstChild : null;
        if (citedBy != null)
        {
            string text = citedBy.InnerText.Replace("Cited by", "").Trim();
            int parsedCount;
            if (int.TryParse(text, out parsedCount))
            {
                no_of_citations = parsedCount;
            }

            // The link is only kept when a non-zero citation count was parsed.
            if (no_of_citations != 0)
            {
                cited_by_url = citedBy.GetAttributeValue("href", "");
                if (!cited_by_url.Equals(""))
                {
                    cited_by_url = ("http://scholar.google.com" + cited_by_url).Replace("amp;", "");
                }
            }
        }

        SG.Paper paper = new SG.Paper(title, titleLink, authors, summary, year, publication, publisher, no_of_citations, cited_by_url, rank);
        result.addPaper(paper);
        rank++;
    }

    // NEXT PAGE URL
    next_url = null;
    HtmlNode bottom = doc.DocumentNode.SelectSingleNode(".//*[@id=\"gs_n\"]//table//td[@align=\"left\"]//a");
    if (bottom != null)
    {
        string href = bottom.GetAttributeValue("href", "");
        if (!href.Equals(""))
        {
            next_url = ("http://scholar.google.com" + href).Replace("amp;", "");
        }
    }

    return result;
}
// GET CITATIONS NEXT PAGE
/// <summary>
/// Loads one Google Scholar result page from <paramref name="url"/>, parses its
/// hits and appends them to <paramref name="papers"/>, never letting the list
/// grow beyond the <c>GSMaxResults</c> setting.
/// </summary>
/// <param name="url">Absolute URL of the page to load; <c>null</c> yields <c>false</c>.</param>
/// <param name="next_url">
/// When the whole page was consumed: the absolute URL of the following page, or
/// <c>null</c> when the page has no such link. Left untouched on every other path.
/// </param>
/// <param name="papers">List the parsed papers are appended to.</param>
/// <returns>
/// <c>true</c> when the page was parsed completely; <c>false</c> when the limit
/// was already reached, was reached part-way through this page (the papers
/// parsed up to the limit are still appended), <paramref name="url"/> is
/// <c>null</c>, or the page has no hits; <c>null</c> when the page could not be
/// loaded or <c>checkForCaptcha()</c> reports a captcha.
/// </returns>
public bool? getCitationsNextPage(string url, ref string next_url, ref List<SG.Paper> papers)
{
    setts = setRecords.ReadSettings();
    int maxResults = setts.GSMaxResults;
    int num = papers.Count;
    if (num >= maxResults)
    {
        return false;
    }

    // CONNECTIONS
    if (url == null)
    {
        return false;
    }

    HtmlWeb web = new HtmlWeb();
    try
    {
        doc = web.Load(url);
    }
    catch (Exception)
    {
        // Any load failure is reported to the caller as null.
        return null;
    }

    string xpath = "//div[@class=\"gs_ri\"]";
    HtmlNodeCollection searchResults = doc.DocumentNode.SelectNodes(xpath);
    if (searchResults == null)
    {
        if (checkForCaptcha())
        {
            Console.WriteLine("Captcha problem ...");
            return null;
        }
        Console.WriteLine("No results ...");
        return false;
    }

    List<SG.Paper> results = new List<SG.Paper>();
    int rank = 1;
    bool limitReached = false;

    foreach (HtmlNode n in searchResults)
    {
        if (num >= maxResults)
        {
            // Stop parsing, but keep what was collected: returning from here
            // directly used to throw away every paper parsed from this page.
            limitReached = true;
            break;
        }

        // TITLE AND TITLE LINK
        HtmlNode titleNode = n.SelectSingleNode(".//*[@class=\"gs_rt\"]");
        if (titleNode == null)
        {
            // A hit without a title block cannot be turned into a paper; skip it
            // instead of failing the whole page with a NullReferenceException.
            continue;
        }
        string title = titleNode.InnerText;
        string titleLink = "";
        HtmlNode url_node = titleNode.SelectSingleNode(".//a");
        if (url_node != null)
        {
            // Strip the "amp;" left over from "&amp;" in the raw attribute value.
            titleLink = url_node.GetAttributeValue("href", "").Replace("amp;", "");
        }

        // AUTHORS AND PUBLICATION
        // The byline is split on '-': authors, then "publication, year" (or just
        // a year), then optionally the publisher.
        string authors = "", publication = "", publisher = "";
        int year = 1970;
        HtmlNode byline = n.SelectSingleNode(".//*[@class=\"gs_a\"]");
        if (byline != null)
        {
            string[] names = byline.InnerText.Split('-');
            authors = names[0];
            if (names.Length > 2)
            {
                publisher = names[2];
            }
            if (names.Length > 1)
            {
                string middle = names[1].Trim();
                int parsedYear;
                if (int.TryParse(middle, out parsedYear))
                {
                    year = parsedYear;
                }
                else
                {
                    string[] p = middle.Split(',');
                    if (p.Length > 1 && int.TryParse(p[1], out parsedYear))
                    {
                        year = parsedYear;
                    }
                    publication = p[0];
                }
            }
        }

        // SUMMARY
        HtmlNode summaryNode = n.SelectSingleNode(".//*[@class=\"gs_rs\"]");
        string summary = summaryNode != null ? summaryNode.InnerText : "";

        // CITATION STUFF
        int no_of_citations = 0;
        string cited_by_url = "";
        HtmlNode footer = n.SelectSingleNode(".//*[@class=\"gs_fl\"]");
        HtmlNode citedBy = footer != null ? footer.FirstChild : null;
        if (citedBy != null)
        {
            // Here the link is kept whether or not a citation count could be parsed.
            cited_by_url = citedBy.GetAttributeValue("href", "");
            if (!cited_by_url.Equals(""))
            {
                cited_by_url = ("http://scholar.google.com" + cited_by_url).Replace("amp;", "");
            }

            string text = citedBy.InnerText.Replace("Cited by", "").Trim();
            int parsedCount;
            if (int.TryParse(text, out parsedCount))
            {
                no_of_citations = parsedCount;
            }
        }

        SG.Paper paper = new SG.Paper(title, titleLink, authors, summary, year, publication, publisher, no_of_citations, cited_by_url, rank);
        results.Add(paper);
        num++;
        rank++;
    }

    papers.AddRange(results);

    if (limitReached)
    {
        // Same return value as before for this situation; next_url stays untouched.
        return false;
    }

    // NEXT PAGE URL
    next_url = null;
    HtmlNode bottom = doc.DocumentNode.SelectSingleNode(".//*[@id=\"gs_n\"]//table//td[@align=\"left\"]//a");
    if (bottom != null)
    {
        string href = bottom.GetAttributeValue("href", "");
        if (!href.Equals(""))
        {
            next_url = ("http://scholar.google.com" + href).Replace("amp;", "");
        }
    }

    return true;
}