/// <summary>
/// Builds a paper set by scraping the <c>cited_by_url</c> of the given paper.
/// </summary>
/// <param name="gssp">The paper whose <c>cited_by_url</c> is scraped; it is also stored as the set's <c>related_gssp</c>.</param>
/// <returns>
/// A new set tagged <c>PaperSetSource.CitedBy</c>, with a null <c>search_query</c> and with <c>gssps</c>
/// holding whatever <c>GoogleScholarScraper.ScrapeUrl</c> returned for the set's url.
/// </returns>
public static GoogleScholarScrapePaperSet GenerateFromCitedBy(GoogleScholarScrapePaper gssp)
{
    GoogleScholarScrapePaperSet paper_set = new GoogleScholarScrapePaperSet
    {
        paper_set_source = PaperSetSource.CitedBy,
        search_query = null,
        related_gssp = gssp,
        url = gssp.cited_by_url
    };

    // The scrape is driven by the url that was just stored on the set.
    paper_set.gssps = GoogleScholarScraper.ScrapeUrl(paper_set.url);

    return paper_set;
}
/// <summary>
/// Builds a paper set by scraping the <c>related_articles_url</c> of the given paper.
/// </summary>
/// <param name="gssp">The paper whose <c>related_articles_url</c> is scraped; it is also stored as the set's <c>related_gssp</c>.</param>
/// <returns>
/// A new set tagged <c>PaperSetSource.RelatedPapers</c>, with a null <c>search_query</c> and with <c>gssps</c>
/// holding whatever <c>GoogleScholarScraper.ScrapeUrl</c> returned for the set's url.
/// </returns>
public static GoogleScholarScrapePaperSet GenerateFromRelatedPapers(GoogleScholarScrapePaper gssp)
{
    GoogleScholarScrapePaperSet paper_set = new GoogleScholarScrapePaperSet
    {
        paper_set_source = PaperSetSource.RelatedPapers,
        search_query = null,
        related_gssp = gssp,
        url = gssp.related_articles_url
    };

    // The scrape is driven by the url that was just stored on the set.
    paper_set.gssps = GoogleScholarScraper.ScrapeUrl(paper_set.url);

    return paper_set;
}
/// <summary>
/// Parses the HTML document for the relevant information: every element with class "gs_r" that
/// carries a "gs_rt" title node is turned into a <see cref="GoogleScholarScrapePaper"/> and appended
/// to <paramref name="gssps"/>.
/// The url is that from where the document was originally downloaded - it is needed to reconstruct some of the relative links.
/// </summary>
/// <param name="doc">The parsed HTML document to scrape.</param>
/// <param name="url">The url the document was downloaded from; handed to GetUrlForRelatedList and used in log messages.</param>
/// <param name="gssps">The list that receives the scraped papers.</param>
private static void ScrapeDoc(HtmlDocument doc, string url, List<GoogleScholarScrapePaper> gssps)
{
    HtmlNodeCollection result_nodes = GetElementsWithClass(doc, "gs_r");
    if (null == result_nodes)
    {
        return;
    }

    foreach (HtmlNode element in result_nodes)
    {
        GoogleScholarScrapePaper gssp = new GoogleScholarScrapePaper();

        // Each result snippet is re-parsed as a document of its own before it is inspected.
        string element_html = element.OuterHtml;
        HtmlDocument item_doc = new HtmlDocument();
        item_doc.LoadHtml(element_html);
        HtmlNode item_node = GetElementsWithClass(item_doc, "gs_r")[0];

        var title_node = GetElementsWithClass(item_node, "gs_rt");
        if (null == title_node)
        {
            Logging.Error("ScrapeDoc: unexpected structure of the Google Scholar search page snippet. Report this at https://github.com/jimmejardine/qiqqa-open-source/issues/ as it seems Google Scholar has changed its HTML output significantly. HTML:\n{0}", element_html);

            // Everything below starts from title_node[0]; skip this snippet rather than crash on it.
            continue;
        }

        // Title, optionally prefixed by a bracketed type tag such as "[PDF] ".
        // The pattern is anchored and lazy so that brackets further on in the title are not mistaken for the type tag.
        string title_raw = WebUtility.HtmlDecode(title_node[0].InnerText);
        Match match = Regex.Match(title_raw, @"^\s*\[(.*?)\] (.*)", RegexOptions.Singleline);
        if (match.Success)
        {
            gssp.type = match.Groups[1].Value;
            gssp.title = match.Groups[2].Value;
        }
        else
        {
            gssp.type = "";
            gssp.title = title_raw;
        }

        // Pull out the source url: the first direct <a> child of the title node.
        var title_links = title_node[0].SelectNodes("a");
        if (null != title_links)
        {
            var href_attr = title_links[0].Attributes["href"];
            if (null != href_attr)
            {
                gssp.source_url = WebUtility.HtmlDecode(href_attr.Value);
            }
        }

        // Pull out the authors
        var authors_node = GetElementsWithClass(item_node, "gs_a");
        if (null != authors_node)
        {
            gssp.authors = WebUtility.HtmlDecode(authors_node[0].InnerHtml);
        }

        // Pull out the abstract
        var abstract_node = GetElementsWithClass(item_node, "gs_rs");
        if (null != abstract_node)
        {
            gssp.abstract_html = WebUtility.HtmlDecode(abstract_node[0].InnerText);
        }

        // Pull out the potential downloads
        var downloads_node = GetElementsWithClass(item_node, "gs_ggsd");  // was 'gs_md_wp gs_ttss' before.
        if (null != downloads_node)
        {
            var download_links = downloads_node[0].SelectNodes(".//a");
            if (null != download_links)
            {
                foreach (var link_node in download_links)
                {
                    var href_attr = link_node.Attributes["href"];
                    if (null != href_attr)
                    {
                        string download_url = href_attr.Value;
                        gssp.download_urls.Add(download_url);
                        Logging.Info("ScrapeDoc(URL: {0}): Downloadable from {1}", url, download_url);
                    }
                }
            }
        }

        // Pull out the "Cited by" / "Related" / "Import into BibTeX" links.
        var see_also_nodes = GetElementsWithClass(item_node, "gs_fl", null, "/a");
        GetUrlForRelatedList(url, "?cites=", "Cited by", see_also_nodes, out gssp.cited_by_header, out gssp.cited_by_url);
        GetUrlForRelatedList(url, "?q=related:", "Related", see_also_nodes, out gssp.related_articles_header, out gssp.related_articles_url);
        GetUrlForRelatedList(url, "scholar.bib?q=info:", "Import into BibTeX", see_also_nodes, out gssp.bibtex_header, out gssp.bibtex_url);

        gssps.Add(gssp);
    }
}
/// <summary>
/// Parses the HTML document for the relevant information: every element whose class attribute is
/// exactly "gs_r" and that carries a "gs_rt" title node is turned into a
/// <see cref="GoogleScholarScrapePaper"/> and appended to <paramref name="gssps"/>.
/// The url is that from where the document was originally downloaded - it is needed to reconstruct some of the relative links.
/// </summary>
/// <param name="doc">The parsed HTML document to scrape.</param>
/// <param name="url">The url the document was downloaded from; handed to GetUrlForRelatedList.</param>
/// <param name="gssps">The list that receives the scraped papers.</param>
private static void ScrapeDoc(HtmlDocument doc, string url, List<GoogleScholarScrapePaper> gssps)
{
    HtmlNodeCollection result_nodes = doc.DocumentNode.SelectNodes("//*[@class='gs_r']");
    if (null == result_nodes)
    {
        return;
    }

    foreach (HtmlNode element in result_nodes)
    {
        GoogleScholarScrapePaper gssp = new GoogleScholarScrapePaper();

        // Each result snippet is re-parsed as a document of its own. The XPath queries below all start
        // with "//", i.e. they search that whole per-snippet document.
        string element_html = element.OuterHtml;
        HtmlDocument item_doc = new HtmlDocument();
        item_doc.LoadHtml(element_html);
        HtmlNode item_node = item_doc.DocumentNode.SelectNodes("//*[@class='gs_r']")[0];

        // SelectNodes yields null when nothing matches, so the title node must be checked before use.
        var title_node = item_node.SelectNodes("//*[@class='gs_rt']");
        if (null == title_node)
        {
            Logging.Error("ScrapeDoc: Google Scholar result snippet has no 'gs_rt' title node; skipping it. HTML:\n{0}", element_html);
            continue;
        }

        // Title, optionally prefixed by a bracketed type tag such as "[PDF] ".
        // The pattern is anchored and lazy so that brackets further on in the title are not mistaken for the type tag.
        string title_raw = WebUtility.HtmlDecode(title_node[0].InnerText);
        Match match = Regex.Match(title_raw, @"^\s*\[(.*?)\] (.*)", RegexOptions.Singleline);
        if (match.Success)
        {
            gssp.type = match.Groups[1].Value;
            gssp.title = match.Groups[2].Value;
        }
        else
        {
            gssp.type = "";
            gssp.title = title_raw;
        }

        // Pull out the source url: the first direct <a> child of the title node.
        var title_links = title_node[0].SelectNodes("a");
        if (null != title_links)
        {
            var href_attr = title_links[0].Attributes["href"];
            if (null != href_attr)
            {
                gssp.source_url = WebUtility.HtmlDecode(href_attr.Value);
            }
        }

        // Pull out the authors
        var authors_node = item_node.SelectNodes("//*[@class='gs_a']");
        if (null != authors_node)
        {
            gssp.authors = WebUtility.HtmlDecode(authors_node[0].InnerHtml);
        }

        // Pull out the abstract
        var abstract_node = item_node.SelectNodes("//*[@class='gs_rs']");
        if (null != abstract_node)
        {
            gssp.abstract_html = WebUtility.HtmlDecode(abstract_node[0].InnerText);
        }

        // Pull out the potential downloads: the direct <a> children of the first download container.
        var downloads_node = item_node.SelectNodes("//*[@class='gs_md_wp gs_ttss']");
        if (null != downloads_node)
        {
            foreach (var child_node in downloads_node[0].ChildNodes)
            {
                if ("a" != child_node.Name)
                {
                    continue;
                }

                // An anchor without href has nothing to download; skip it instead of dereferencing null.
                var href_attr = child_node.Attributes["href"];
                if (null == href_attr)
                {
                    continue;
                }

                string download_url = href_attr.Value;
                gssp.download_urls.Add(download_url);
                Logging.Info("Downloadable from {0}", download_url);
            }
        }

        // Pull out the "Cited by" / "Related" / "Import into BibTeX" links.
        var see_also_nodes = item_node.SelectNodes("//*[@class='gs_fl']/a");
        GetUrlForRelatedList(url, "Cited by", see_also_nodes, out gssp.cited_by_header, out gssp.cited_by_url);
        GetUrlForRelatedList(url, "Related", see_also_nodes, out gssp.related_articles_header, out gssp.related_articles_url);
        GetUrlForRelatedList(url, "Import into BibTeX", see_also_nodes, out gssp.bibtex_header, out gssp.bibtex_url);

        gssps.Add(gssp);
    }
}
/// <summary>
/// Parses the HTML document for the relevant information: every element with class "gs_r" that
/// carries a "gs_rt" title node is turned into a <see cref="GoogleScholarScrapePaper"/> and appended
/// to <paramref name="gssps"/>. Snippets without a title node are skipped; an error is logged for
/// them unless they are one of the recognised non-paper blurbs.
/// The url is that from where the document was originally downloaded - it is needed to reconstruct some of the relative links.
/// </summary>
/// <param name="doc">The parsed HTML document to scrape.</param>
/// <param name="url">The url the document was downloaded from; handed to GetUrlForRelatedList and used in log messages.</param>
/// <param name="gssps">The list that receives the scraped papers. The reference itself is never reassigned here.</param>
protected static void ScrapeDoc(HtmlDocument doc, string url, ref List<GoogleScholarScrapePaper> gssps)
{
    HtmlNodeCollection result_nodes = GetElementsWithClass(doc, "gs_r");
    if (null == result_nodes)
    {
        return;
    }

    foreach (HtmlNode element in result_nodes)
    {
        GoogleScholarScrapePaper gssp = new GoogleScholarScrapePaper();

        // Each result snippet is re-parsed as a document of its own before it is inspected.
        string element_html = element.OuterHtml;
        HtmlDocument item_doc = new HtmlDocument();
        item_doc.LoadHtml(element_html);
        HtmlNode item_node = GetElementsWithClass(item_doc, "gs_r")[0];

        var title_node = GetElementsWithClass(item_node, "gs_rt");
        if (null == title_node)
        {
            if (!IsIgnorableScholarBlurb(item_node))
            {
                Logging.Error("ScrapeDoc: unexpected structure of the Google Scholar search page snippet. Report this at https://github.com/jimmejardine/qiqqa-open-source/issues/ as it seems Google Scholar has changed its HTML output significantly. HTML:\n{0}", element_html);
            }
            continue;
        }

        string title_raw = WebUtility.HtmlDecode(title_node[0].InnerText);

        // Anno 2020, Google Scholar has the type duplicated in *two* spans: we only extract the first of those.
        //
        // The pattern is anchored at the start of the title and the optional second span must itself open
        // with '[', so that brackets occurring later in the title are left in the title.
        Match match = Regex.Match(title_raw, @"^\s*\[(.*?)(\]\s*\[[^\]]+)?\] (.*)", RegexOptions.Singleline);
        if (match.Success)
        {
            gssp.type = match.Groups[1].Value;
            gssp.title = match.Groups[3].Value;
        }
        else
        {
            gssp.type = "";
            gssp.title = title_raw;
        }

        // Pull out the source url: the first direct <a> child of the title node.
        var title_links = title_node[0].SelectNodes("a");
        if (null != title_links)
        {
            var href_attr = title_links[0].Attributes["href"];
            if (null != href_attr)
            {
                gssp.source_url = WebUtility.HtmlDecode(href_attr.Value);
            }
        }

        // Pull out the authors
        var authors_node = GetElementsWithClass(item_node, "gs_a");
        if (null != authors_node)
        {
            gssp.authors = WebUtility.HtmlDecode(authors_node[0].InnerHtml);
        }

        // Pull out the abstract
        var abstract_node = GetElementsWithClass(item_node, "gs_rs");
        if (null != abstract_node)
        {
            gssp.abstract_html = WebUtility.HtmlDecode(abstract_node[0].InnerText);
        }

        // Pull out the potential downloads
        var downloads_node = GetElementsWithClass(item_node, "gs_ggsd");  // was 'gs_md_wp gs_ttss' before.
        if (null != downloads_node)
        {
            var download_links = downloads_node[0].SelectNodes(".//a");
            if (null != download_links)
            {
                foreach (var link_node in download_links)
                {
                    var href_attr = link_node.Attributes["href"];
                    if (null != href_attr)
                    {
                        string download_url = href_attr.Value;
                        gssp.download_urls.Add(download_url);
                        Logging.Info("ScrapeDoc(URL: {0}): Downloadable from {1}", url, download_url);
                    }
                }
            }
        }

        // Pull out the "Cited by" / "Related" / "Import into BibTeX" links, when the snippet has any.
        var see_also_nodes = GetElementsWithClass(item_node, "gs_fl", null, "/a");
        if (see_also_nodes != null)
        {
            GetUrlForRelatedList(url, "?cites=", "Cited by", see_also_nodes, out gssp.cited_by_header, out gssp.cited_by_url);
            GetUrlForRelatedList(url, "?q=related:", "Related", see_also_nodes, out gssp.related_articles_header, out gssp.related_articles_url);
            GetUrlForRelatedList(url, "scholar.bib?q=info:", "Import into BibTeX", see_also_nodes, out gssp.bibtex_header, out gssp.bibtex_url);
        }

        gssps.Add(gssp);
    }
}

/// <summary>
/// Tells whether a title-less "gs_r" snippet is one of the known Google Scholar blurbs that are
/// expected to carry no paper and can therefore be skipped without reporting an error.
/// </summary>
/// <param name="item_node">The "gs_r" node of the snippet.</param>
/// <returns>True when the snippet matches one of the known blurb shapes.</returns>
private static bool IsIgnorableScholarBlurb(HtmlNode item_node)
{
    HtmlNode first_child = item_node.FirstChild;
    if (null != first_child)
    {
        // Google Scholar alerts blurb
        if (first_child.Name == "a" && (first_child.Attributes["href"]?.Value ?? "").Contains("scholar_alerts?"))
        {
            return true;
        }

        // Google Scholar "best result for this search query" blurb
        if (first_child.Name == "#text")
        {
            return true;
        }
    }

    // Google Scholar "related searches for this search query" blurb
    return (item_node.Attributes["class"]?.Value ?? "").Contains("gs_qsuggest");
}