/// <summary>
/// Creates a paper set of source kind <c>PaperSetSource.CitedBy</c> for the given paper: the set keeps
/// the paper as its related paper, uses the paper's <c>cited_by_url</c> as its URL and is populated
/// with whatever <c>GoogleScholarScraper.ScrapeUrl</c> returns for that URL.
/// </summary>
/// <param name="gssp">The paper whose <c>cited_by_url</c> is scraped.</param>
/// <returns>The newly created paper set.</returns>
public static GoogleScholarScrapePaperSet GenerateFromCitedBy(GoogleScholarScrapePaper gssp)
        {
            var cited_by_set = new GoogleScholarScrapePaperSet
            {
                paper_set_source = PaperSetSource.CitedBy,
                search_query = null,
                related_gssp = gssp,
                url = gssp.cited_by_url
            };

            // The scrape reads the URL back from the set, so it runs only after the URL has been stored.
            cited_by_set.gssps = GoogleScholarScraper.ScrapeUrl(cited_by_set.url);

            return cited_by_set;
        }
        /// <summary>
        /// Creates a paper set of source kind <c>PaperSetSource.RelatedPapers</c> for the given paper: the set
        /// keeps the paper as its related paper, uses the paper's <c>related_articles_url</c> as its URL and is
        /// populated with whatever <c>GoogleScholarScraper.ScrapeUrl</c> returns for that URL.
        /// </summary>
        /// <param name="gssp">The paper whose <c>related_articles_url</c> is scraped.</param>
        /// <returns>The newly created paper set.</returns>
        public static GoogleScholarScrapePaperSet GenerateFromRelatedPapers(GoogleScholarScrapePaper gssp)
        {
            var related_set = new GoogleScholarScrapePaperSet
            {
                paper_set_source = PaperSetSource.RelatedPapers,
                search_query = null,
                related_gssp = gssp,
                url = gssp.related_articles_url
            };

            // The scrape reads the URL back from the set, so it runs only after the URL has been stored.
            related_set.gssps = GoogleScholarScraper.ScrapeUrl(related_set.url);

            return related_set;
        }
        /// <summary>
        /// Parses the HTML document for the relevant information.
        /// The url is that from where the document was originally downloaded - it is needed to reconstruct some of the relative links.
        /// Every result entry (element of class 'gs_r') which has a title node (class 'gs_rt') is converted
        /// into a paper record and appended to <paramref name="gssps"/>; an entry without a title node is
        /// reported through Logging.Error and skipped.
        /// </summary>
        /// <param name="doc">The HTML document to scan for result entries.</param>
        /// <param name="url">The URL the document was downloaded from; it is forwarded to GetUrlForRelatedList and included in log messages.</param>
        /// <param name="gssps">The list which receives the scraped papers.</param>
        private static void ScrapeDoc(HtmlDocument doc, string url, List <GoogleScholarScrapePaper> gssps)
        {
            HtmlNodeCollection NoAltElements_outer = GetElementsWithClass(doc, "gs_r");

            if (null != NoAltElements_outer)
            {
                foreach (HtmlNode element in NoAltElements_outer)
                {
                    GoogleScholarScrapePaper gssp = new GoogleScholarScrapePaper();

                    string element_html = element.OuterHtml;

                    // Load the entry into its own document; all lookups below run against that document.
                    HtmlDocument item_doc = new HtmlDocument();
                    item_doc.LoadHtml(element_html);

                    HtmlNode NoAltElements = GetElementsWithClass(item_doc, "gs_r")[0];

                    var title_node = GetElementsWithClass(NoAltElements, "gs_rt");
                    if (null == title_node)
                    {
                        Logging.Error("ScrapeDoc: unexpected structure of the Google Scholar search page snippet. Report this at https://github.com/jimmejardine/qiqqa-open-source/issues/ as it seems Google Scholar has changed its HTML output significantly. HTML:\n{0}", element_html);

                        // Without a title node there is nothing to extract the title or source link from,
                        // so skip this entry instead of dereferencing the missing node further down.
                        continue;
                    }

                    // Pull out the title and the optional "[TYPE] " prefix
                    {
                        string title_raw = WebUtility.HtmlDecode(title_node[0].InnerText);

                        Match match = Regex.Match(title_raw, @"\[(.*)\] (.*)", RegexOptions.Singleline);
                        if (match.Success)
                        {
                            gssp.type  = match.Groups[1].Value;
                            gssp.title = match.Groups[2].Value;
                        }
                        else
                        {
                            gssp.type  = "";
                            gssp.title = title_raw;
                        }
                    }

                    // Pull out the link to the source, taken from the first direct <a> child of the title node
                    {
                        var source_url_node = title_node[0].SelectNodes("a");
                        if (null != source_url_node)
                        {
                            var href_attribute = source_url_node[0].Attributes["href"];
                            if (null != href_attribute)
                            {
                                gssp.source_url = WebUtility.HtmlDecode(href_attribute.Value);
                            }
                        }
                    }

                    // Pull out the authors line
                    {
                        var authors_node = GetElementsWithClass(NoAltElements, "gs_a");
                        if (null != authors_node)
                        {
                            gssp.authors = WebUtility.HtmlDecode(authors_node[0].InnerHtml);
                        }
                    }

                    // Pull out the abstract
                    {
                        var abstract_node = GetElementsWithClass(NoAltElements, "gs_rs");
                        if (null != abstract_node)
                        {
                            gssp.abstract_html = WebUtility.HtmlDecode(abstract_node[0].InnerText);
                        }
                    }

                    // Pull out the potential downloads
                    {
                        var downloads_node = GetElementsWithClass(NoAltElements, "gs_ggsd");  // was 'gs_md_wp gs_ttss' before.
                        if (null != downloads_node)
                        {
                            var download_link_nodes = downloads_node[0].SelectNodes(".//a");
                            if (null != download_link_nodes)
                            {
                                foreach (var child_node in download_link_nodes)
                                {
                                    if (null != child_node.Attributes["href"])
                                    {
                                        string download_url = child_node.Attributes["href"].Value;
                                        gssp.download_urls.Add(download_url);
                                        Logging.Info("ScrapeDoc(URL: {0}): Downloadable from {1}", url, download_url);
                                    }
                                }
                            }
                        }
                    }

                    // Pull out the "Cited by", "Related" and "Import into BibTeX" links
                    var see_also_nodes = GetElementsWithClass(NoAltElements, "gs_fl", null, "/a");
                    GetUrlForRelatedList(url, "?cites=", "Cited by", see_also_nodes, out gssp.cited_by_header, out gssp.cited_by_url);
                    GetUrlForRelatedList(url, "?q=related:", "Related", see_also_nodes, out gssp.related_articles_header, out gssp.related_articles_url);
                    GetUrlForRelatedList(url, "scholar.bib?q=info:", "Import into BibTeX", see_also_nodes, out gssp.bibtex_header, out gssp.bibtex_url);

                    gssps.Add(gssp);
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Parses the HTML document for the relevant information.
        /// The url is that from where the document was originally downloaded - it is needed to reconstruct some of the relative links.
        /// Every result entry (element with class attribute 'gs_r') which has a title node (class 'gs_rt') is
        /// converted into a paper record and appended to <paramref name="gssps"/>; an entry without a title
        /// node is reported through Logging.Error and skipped.
        /// </summary>
        /// <param name="doc">The HTML document to scan for result entries.</param>
        /// <param name="url">The URL the document was downloaded from; it is forwarded to GetUrlForRelatedList.</param>
        /// <param name="gssps">The list which receives the scraped papers.</param>
        private static void ScrapeDoc(HtmlDocument doc, string url, List <GoogleScholarScrapePaper> gssps)
        {
            HtmlNodeCollection NoAltElements_outer = doc.DocumentNode.SelectNodes("//*[@class='gs_r']");

            if (null != NoAltElements_outer)
            {
                foreach (HtmlNode element in NoAltElements_outer)
                {
                    GoogleScholarScrapePaper gssp = new GoogleScholarScrapePaper();

                    string element_html = element.OuterHtml;

                    // Load the entry into its own document; the XPath queries below start with '//' and
                    // therefore search that whole per-entry document.
                    HtmlDocument item_doc = new HtmlDocument();
                    item_doc.LoadHtml(element_html);

                    HtmlNode NoAltElements = item_doc.DocumentNode.SelectNodes("//*[@class='gs_r']")[0];

                    var title_node = NoAltElements.SelectNodes("//*[@class='gs_rt']");
                    if (null == title_node)
                    {
                        // SelectNodes yields null when nothing matches: without a title node there is nothing
                        // to extract the title or source link from, so report and skip this entry.
                        Logging.Error("ScrapeDoc: unexpected structure of the Google Scholar search page snippet. Report this at https://github.com/jimmejardine/qiqqa-open-source/issues/ as it seems Google Scholar has changed its HTML output significantly. HTML:\n{0}", element_html);
                        continue;
                    }

                    // Pull out the title and the optional "[TYPE] " prefix
                    {
                        string title_raw = WebUtility.HtmlDecode(title_node[0].InnerText);

                        Match match = Regex.Match(title_raw, @"\[(.*)\] (.*)", RegexOptions.Singleline);
                        if (match.Success)
                        {
                            gssp.type  = match.Groups[1].Value;
                            gssp.title = match.Groups[2].Value;
                        }
                        else
                        {
                            gssp.type  = "";
                            gssp.title = title_raw;
                        }
                    }

                    // Pull out the link to the source, taken from the first direct <a> child of the title node
                    {
                        var source_url_node = title_node[0].SelectNodes("a");
                        if (null != source_url_node)
                        {
                            var href_attribute = source_url_node[0].Attributes["href"];
                            if (null != href_attribute)
                            {
                                gssp.source_url = WebUtility.HtmlDecode(href_attribute.Value);
                            }
                        }
                    }

                    // Pull out the authors line
                    {
                        var authors_node = NoAltElements.SelectNodes("//*[@class='gs_a']");
                        if (null != authors_node)
                        {
                            gssp.authors = WebUtility.HtmlDecode(authors_node[0].InnerHtml);
                        }
                    }

                    // Pull out the abstract
                    {
                        var abstract_node = NoAltElements.SelectNodes("//*[@class='gs_rs']");
                        if (null != abstract_node)
                        {
                            gssp.abstract_html = WebUtility.HtmlDecode(abstract_node[0].InnerText);
                        }
                    }

                    // Pull out the potential downloads
                    {
                        var downloads_node = NoAltElements.SelectNodes("//*[@class='gs_md_wp gs_ttss']");
                        if (null != downloads_node)
                        {
                            foreach (var child_node in downloads_node[0].ChildNodes)
                            {
                                // Only direct <a> children which actually carry an href are download links.
                                if ("a" == child_node.Name && null != child_node.Attributes["href"])
                                {
                                    string download_url = child_node.Attributes["href"].Value;
                                    gssp.download_urls.Add(download_url);
                                    Logging.Info("Downloadable from {0}", download_url);
                                }
                            }
                        }
                    }

                    // Pull out the "Cited by", "Related" and "Import into BibTeX" links
                    var see_also_nodes = NoAltElements.SelectNodes("//*[@class='gs_fl']/a");
                    GetUrlForRelatedList(url, "Cited by", see_also_nodes, out gssp.cited_by_header, out gssp.cited_by_url);
                    GetUrlForRelatedList(url, "Related", see_also_nodes, out gssp.related_articles_header, out gssp.related_articles_url);
                    GetUrlForRelatedList(url, "Import into BibTeX", see_also_nodes, out gssp.bibtex_header, out gssp.bibtex_url);

                    gssps.Add(gssp);
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Parses the HTML document for the relevant information.
        /// The url is that from where the document was originally downloaded - it is needed to reconstruct some of the relative links.
        /// Every result entry (element of class 'gs_r') which has a title node (class 'gs_rt') is converted
        /// into a paper record and appended to <paramref name="gssps"/>. Entries without a title node are
        /// silently ignored when they are recognised as one of the known Google Scholar blurbs and are
        /// reported through Logging.Error otherwise.
        /// </summary>
        /// <param name="doc">The HTML document to scan for result entries.</param>
        /// <param name="url">The URL the document was downloaded from; it is forwarded to GetUrlForRelatedList and included in log messages.</param>
        /// <param name="gssps">The list which receives the scraped papers; this method only appends to it and never reassigns the reference.</param>
        protected static void ScrapeDoc(HtmlDocument doc, string url, ref List <GoogleScholarScrapePaper> gssps)
        {
            HtmlNodeCollection NoAltElements_outer = GetElementsWithClass(doc, "gs_r");

            if (null != NoAltElements_outer)
            {
                foreach (HtmlNode element in NoAltElements_outer)
                {
                    GoogleScholarScrapePaper gssp = new GoogleScholarScrapePaper();

                    string element_html = element.OuterHtml;

                    // Load the entry into its own document; all lookups below run against that document.
                    HtmlDocument item_doc = new HtmlDocument();
                    item_doc.LoadHtml(element_html);

                    HtmlNode NoAltElements = GetElementsWithClass(item_doc, "gs_r")[0];

                    var title_node = GetElementsWithClass(NoAltElements, "gs_rt");
                    if (null != title_node)
                    {
                        string title_raw = WebUtility.HtmlDecode(title_node[0].InnerText);

                        // Anno 2020, Google Scholar has the type duplicated in *two* spans: we only extract the first of those.
                        Match match = Regex.Match(title_raw, @"\[(.*?)(\][^\]]+)?\] (.*)", RegexOptions.Singleline);
                        if (match.Success)
                        {
                            gssp.type  = match.Groups[1].Value;
                            gssp.title = match.Groups[3].Value;
                        }
                        else
                        {
                            gssp.type  = "";
                            gssp.title = title_raw;
                        }

                        // Pull out the link to the source, taken from the first direct <a> child of the title node
                        {
                            var source_url_node = title_node[0].SelectNodes("a");
                            if (null != source_url_node)
                            {
                                // Guard against an anchor without href, just like the download links below.
                                var href_attribute = source_url_node[0].Attributes["href"];
                                if (null != href_attribute)
                                {
                                    gssp.source_url = WebUtility.HtmlDecode(href_attribute.Value);
                                }
                            }
                        }

                        // Pull out the authors line
                        {
                            var authors_node = GetElementsWithClass(NoAltElements, "gs_a");
                            if (null != authors_node)
                            {
                                gssp.authors = WebUtility.HtmlDecode(authors_node[0].InnerHtml);
                            }
                        }

                        // Pull out the abstract
                        {
                            var abstract_node = GetElementsWithClass(NoAltElements, "gs_rs");
                            if (null != abstract_node)
                            {
                                gssp.abstract_html = WebUtility.HtmlDecode(abstract_node[0].InnerText);
                            }
                        }

                        // Pull out the potential downloads
                        {
                            var downloads_node = GetElementsWithClass(NoAltElements, "gs_ggsd");  // was 'gs_md_wp gs_ttss' before.
                            if (null != downloads_node)
                            {
                                var download_link_nodes = downloads_node[0].SelectNodes(".//a");
                                if (null != download_link_nodes)
                                {
                                    foreach (var child_node in download_link_nodes)
                                    {
                                        if (null != child_node.Attributes["href"])
                                        {
                                            string download_url = child_node.Attributes["href"].Value;
                                            gssp.download_urls.Add(download_url);
                                            Logging.Info("ScrapeDoc(URL: {0}): Downloadable from {1}", url, download_url);
                                        }
                                    }
                                }
                            }
                        }

                        // Pull out the "Cited by", "Related" and "Import into BibTeX" links, when the entry has any
                        var see_also_nodes = GetElementsWithClass(NoAltElements, "gs_fl", null, "/a");
                        if (see_also_nodes != null)
                        {
                            GetUrlForRelatedList(url, "?cites=", "Cited by", see_also_nodes, out gssp.cited_by_header, out gssp.cited_by_url);
                            GetUrlForRelatedList(url, "?q=related:", "Related", see_also_nodes, out gssp.related_articles_header, out gssp.related_articles_url);
                            GetUrlForRelatedList(url, "scholar.bib?q=info:", "Import into BibTeX", see_also_nodes, out gssp.bibtex_header, out gssp.bibtex_url);
                        }

                        gssps.Add(gssp);
                    }
                    else if (NoAltElements.FirstChild != null && NoAltElements.FirstChild.Name == "a" && (NoAltElements.FirstChild.Attributes["href"]?.Value ?? "").Contains("scholar_alerts?"))
                    {
                        // ignore Google Scholar alerts blurb
                    }
                    else if (NoAltElements.FirstChild != null && NoAltElements.FirstChild.Name == "#text")
                    {
                        // ignore Google Scholar "best result for this search query" blurb
                    }
                    else if ((NoAltElements.Attributes["class"]?.Value ?? "").Contains("gs_qsuggest"))
                    {
                        // ignore Google Scholar "related searches for this search query" blurb
                    }
                    else
                    {
                        Logging.Error("ScrapeDoc: unexpected structure of the Google Scholar search page snippet. Report this at https://github.com/jimmejardine/qiqqa-open-source/issues/ as it seems Google Scholar has changed its HTML output significantly. HTML:\n{0}", element_html);
                    }
                }
            }
        }