/// <summary>
/// Searches Google Scholar with a plain query and parses the result page.
/// </summary>
/// <param name="query"> Search text; it is turned into a request URL by <c>GetQueryUrl</c> </param>
/// <returns>
/// Articles parsed from the first 10 result entries of the page, in page order.
/// Entries that do not look like an article (no title heading) are skipped.
/// </returns>
public List<ScholarArticle> GetScholarArticlesByQuery(string query)
{
    // getting page content from scholar page (with given query)
    query = GetQueryUrl(query);
    string pageContent = GetPageContent(query);

    return ParseScholarArticles(pageContent, 10);
}

/// <summary>
/// Main class method: searches Google Scholar with the given filters and parses the result page.
/// </summary>
/// <param name="query"> String from the text box </param>
/// <param name="page"> Number of the result page to fetch </param>
/// <param name="exactPhrase"> Phrase the article should contain </param>
/// <param name="without"> Words the article should not contain </param>
/// <param name="head"> Whether to search only in the article head </param>
/// <param name="published"> Journal where the article was published </param>
/// <param name="author"> Author of the article </param>
/// <param name="dateStart"> Since date (defaults to <c>int.MinValue</c>; presumably "not set" - confirm in <c>GetQueryUrl</c>) </param>
/// <param name="dateEnd"> Till date (defaults to <c>int.MinValue</c>; presumably "not set" - confirm in <c>GetQueryUrl</c>) </param>
/// <returns>
/// List of articles from Google Scholar, in page order. Entries that do not look like
/// an article (no title heading) are skipped.
/// </returns>
public List<ScholarArticle> GetScholarArticlesByQuery(string query, int page, string exactPhrase = null, string without = null, bool head = false, string published = null, string author = null, int dateStart = int.MinValue, int dateEnd = int.MinValue)
{
    // getting page content from scholar page (with given query)
    query = GetQueryUrl(query, page, exactPhrase, without, head, published, author, dateStart, dateEnd);
    string pageContent = GetPageContent(query);

    // NOTE(review): this overload scans 11 entries while the single-argument overload scans 10;
    // presumably to tolerate one extra non-article block on the page - confirm.
    return ParseScholarArticles(pageContent, 11);
}

/// <summary>
/// Loads the HTML of a result page and parses result entries 1..<paramref name="maxResultIndex"/>.
/// </summary>
/// <param name="pageContent"> HTML of the result page </param>
/// <param name="maxResultIndex"> Highest 1-based entry index to inspect </param>
/// <returns> The articles that could be parsed, in page order </returns>
private static List<ScholarArticle> ParseScholarArticles(string pageContent, int maxResultIndex)
{
    var doc = new HtmlDocument();
    doc.LoadHtml(pageContent);

    // creating list of articles for "search on scholar" view
    var scholarArticles = new List<ScholarArticle>();
    for (var i = 1; i <= maxResultIndex; i++)
    {
        var article = ParseScholarArticle(doc, i);
        if (article != null)
        {
            // Add rather than Insert(i - 1, ...): once an entry is skipped,
            // a positional insert would be out of range.
            scholarArticles.Add(article);
        }
    }

    return scholarArticles;
}

/// <summary>
/// Parses a single result entry of the page.
/// </summary>
/// <param name="doc"> Loaded result page </param>
/// <param name="index"> 1-based position of the entry inside the results container </param>
/// <returns> The parsed article, or <c>null</c> when the entry has no title heading </returns>
private static ScholarArticle ParseScholarArticle(HtmlDocument doc, int index)
{
    // User-facing texts are kept exactly as before (including spelling): callers may display or compare them.
    const string noCitations = "No citiations for this article. ";
    const string noReference = "This article does not have a reference";

    var resultPath = $"//*[@id='gs_ccl_results']/div[{index}]";

    // When the entry has a second child div, the article data is read from it;
    // otherwise it is read from the entry's child div.
    var contentPath = doc.DocumentNode.SelectSingleNode(resultPath + "/div[2]") != null
        ? resultPath + "/div[2]"
        : resultPath + "/div";

    var article = new ScholarArticle();
    string citationsPath;

    var link = doc.DocumentNode.SelectSingleNode(contentPath + "/h3/a");
    if (link != null)
    {
        // adding title and reference of article
        article.Title = link.InnerText;
        article.Reference = link.GetAttributeValue("href", null);
        citationsPath = contentPath + "/div[3]/a[1]";
    }
    else
    {
        // case, when article does not have a reference, but has a tag [citation]/[book]
        var heading = doc.DocumentNode.SelectSingleNode(contentPath + "/h3");
        if (heading == null)
        {
            // Not an article entry (or the page has fewer entries than scanned).
            return null;
        }

        // Drop the tag span so that only the title text remains; the span is optional.
        var tag = heading.SelectSingleNode("./span");
        if (tag != null)
        {
            heading.RemoveChild(tag);
        }

        article.Title = heading.InnerText;
        article.Reference = noReference;
        citationsPath = contentPath + "/div[2]/a[1]";
    }

    // adding info; a missing info node yields an empty string instead of a crash
    article.Info = ReadInnerText(doc, contentPath + "/div[1]") ?? string.Empty;

    // adding citations amount; the first footer link is used only when it is the "Cited by" link
    var citedBy = ReadInnerText(doc, citationsPath);
    article.Citiations = citedBy != null && citedBy.StartsWith("Cited by", System.StringComparison.Ordinal)
        ? citedBy
        : noCitations;

    return article;
}

/// <summary>
/// Returns the inner text of the first node matching <paramref name="xPath"/>, or <c>null</c> when nothing matches.
/// </summary>
private static string ReadInnerText(HtmlDocument doc, string xPath)
{
    var node = doc.DocumentNode.SelectSingleNode(xPath);
    return node?.InnerText;
}