Example #1
0
        /// <summary>
        /// Builds the request URL for <paramref name="query"/>, downloads the results page and
        /// parses the first ten positions of the <c>gs_ccl_results</c> container into articles.
        /// </summary>
        /// <param name="query"> Search text; converted to a request URL by GetQueryUrl </param>
        /// <returns>
        /// Parsed articles in page order. Positions at which no title can be found are skipped,
        /// so the list may hold fewer than ten items (or be empty).
        /// </returns>
        public List <ScholarArticle> GetScholarArticlesByQuery(string query)
        {
            // number of child positions of the results container that are inspected
            const int resultSlots = 10;

            // getting page content from scholar page (with given query)
            query = GetQueryUrl(query);
            string pageContent = GetPageContent(query);

            // list of articles for the "search on scholar" view
            List <ScholarArticle> scholarArticles = new List <ScholarArticle>();

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(pageContent);

            for (int i = 1; i <= resultSlots; i++)
            {
                string resultPath = string.Format("//*[@id='gs_ccl_results']/div[{0}]", i);

                // When the entry has a second child div the article data is read from it;
                // otherwise it is read from the entry's child div.
                bool   hasSecondDiv = doc.DocumentNode.SelectSingleNode(resultPath + "/div[2]") != null;
                string bodyPath     = resultPath + (hasSecondDiv ? "/div[2]" : "/div");

                string title;
                string reference;
                string citationsPath;

                HtmlNode linkNode = doc.DocumentNode.SelectSingleNode(bodyPath + "/h3/a");
                if (linkNode != null)
                {
                    // regular entry: the heading link carries both the title and the reference
                    title         = linkNode.InnerText;
                    reference     = linkNode.GetAttributeValue("href", null);
                    citationsPath = bodyPath + "/div[3]/a[1]";
                }
                else
                {
                    // case, when article does not have a reference, but has a tag [citiation]/[book]
                    HtmlNode headingNode = doc.DocumentNode.SelectSingleNode(bodyPath + "/h3");
                    if (headingNode == null)
                    {
                        // nothing parseable at this position (e.g. fewer results than slots)
                        continue;
                    }

                    // drop the tag span so that only the title text remains;
                    // RemoveChild must not be called with null
                    HtmlNode tagNode = doc.DocumentNode.SelectSingleNode(bodyPath + "/h3/span");
                    if (tagNode != null)
                    {
                        headingNode.RemoveChild(tagNode);
                    }

                    title         = headingNode.InnerText;
                    reference     = "This article does not have a reference";
                    citationsPath = bodyPath + "/div[2]/a[1]";
                }

                // SelectSingleNode returns null when nothing matches, so both lookups are guarded
                HtmlNode infoNode      = doc.DocumentNode.SelectSingleNode(bodyPath + "/div[1]");
                HtmlNode citationsNode = doc.DocumentNode.SelectSingleNode(citationsPath);
                string   citationsText = citationsNode != null ? citationsNode.InnerText : null;

                ScholarArticle article = new ScholarArticle();
                article.Title     = title;
                article.Info      = infoNode != null ? infoNode.InnerText : string.Empty;
                article.Reference = reference;
                if (citationsText != null && citationsText.StartsWith("Cited by", System.StringComparison.Ordinal))
                {
                    article.Citiations = citationsText;
                }
                else
                {
                    article.Citiations = "No citiations for this article. ";
                }

                // Add instead of Insert(i - 1, ...): positions may be skipped above
                scholarArticles.Add(article);
            }

            return(scholarArticles);
        }
        /// <summary>
        /// Builds the request URL from the search options, downloads the results page and
        /// parses the first eleven positions of the <c>gs_ccl_results</c> container into articles.
        /// </summary>
        /// <param name="query"> String from the text box </param>
        /// <param name="page"> Number of the results page to fetch </param>
        /// <param name="exactPhrase"> Phrase the article should contain </param>
        /// <param name="without"> Words the article should not contain </param>
        /// <param name="head"> Whether to search only in the article head </param>
        /// <param name="published"> Journal in which the article was published </param>
        /// <param name="author"> Author of the article </param>
        /// <param name="dateStart"> Since date (default int.MinValue; presumably means "not set" - confirm in GetQueryUrl) </param>
        /// <param name="dateEnd"> Till date (default int.MinValue; presumably means "not set" - confirm in GetQueryUrl) </param>
        /// <returns>
        /// List of articles from Google.Scholar in page order. Positions at which no title can be
        /// found are skipped, so the list may be shorter than the number of inspected positions.
        /// </returns>
        public List <ScholarArticle> GetScholarArticlesByQuery(string query,
                                                               int page,
                                                               string exactPhrase = null,
                                                               string without     = null,
                                                               bool head          = false,
                                                               string published   = null,
                                                               string author      = null,
                                                               int dateStart      = int.MinValue,
                                                               int dateEnd        = int.MinValue)
        {
            // number of child positions of the results container that are inspected
            const int resultSlots = 11;

            // getting page content from scholar page (with given query)
            query = GetQueryUrl(query, page, exactPhrase, without, head, published, author, dateStart, dateEnd);
            var pageContent = GetPageContent(query);

            // creating list of articles for "search on scholar" view
            var scholarArticles = new List <ScholarArticle>();

            var doc = new HtmlDocument();
            doc.LoadHtml(pageContent);

            for (var i = 1; i <= resultSlots; i++)
            {
                var resultPath = $"//*[@id='gs_ccl_results']/div[{i}]";

                // When the entry has a second child div the article data is read from it;
                // otherwise it is read from the entry's child div.
                var hasSecondDiv = doc.DocumentNode.SelectSingleNode($"{resultPath}/div[2]") != null;
                var bodyPath     = hasSecondDiv ? $"{resultPath}/div[2]" : $"{resultPath}/div";

                string title;
                string reference;
                string citationsPath;

                var linkNode = doc.DocumentNode.SelectSingleNode($"{bodyPath}/h3/a");
                if (linkNode != null)
                {
                    // regular entry: the heading link carries both the title and the reference
                    title         = linkNode.InnerText;
                    reference     = linkNode.GetAttributeValue("href", null);
                    citationsPath = $"{bodyPath}/div[3]/a[1]";
                }
                else
                {
                    // case, when article does not have a reference, but has a tag [citiation]/[book]
                    var headingNode = doc.DocumentNode.SelectSingleNode($"{bodyPath}/h3");
                    if (headingNode == null)
                    {
                        // nothing parseable at this position
                        continue;
                    }

                    // drop the tag span so that only the title text remains;
                    // RemoveChild must not be called with null
                    var tagNode = doc.DocumentNode.SelectSingleNode($"{bodyPath}/h3/span");
                    if (tagNode != null)
                    {
                        headingNode.RemoveChild(tagNode);
                    }

                    title         = headingNode.InnerText;
                    reference     = "This article does not have a reference";
                    citationsPath = $"{bodyPath}/div[2]/a[1]";
                }

                // SelectSingleNode returns null when nothing matches, hence the null-conditional reads
                var infoText      = doc.DocumentNode.SelectSingleNode($"{bodyPath}/div[1]")?.InnerText;
                var citationsText = doc.DocumentNode.SelectSingleNode(citationsPath)?.InnerText;
                var hasCitations  = citationsText != null
                                    && citationsText.StartsWith("Cited by", System.StringComparison.Ordinal);

                scholarArticles.Add(new ScholarArticle
                {
                    Title      = title,
                    Info       = infoText ?? string.Empty,
                    Reference  = reference,
                    Citiations = hasCitations ? citationsText : "No citiations for this article. "
                });
            }

            return(scholarArticles);
        }