Ejemplo n.º 1
0
 // RESULTS FROM AUTHOR PROFILE PAGE
 /// <summary>
 /// Builds an <c>SG.Author</c> from an author profile page: name, affiliation,
 /// home page, the two index values reported by the scraper, and the papers
 /// listed on the current page.
 /// </summary>
 /// <param name="authUrl">Profile URL; "&amp;pagesize=100" is appended before scraping.</param>
 /// <returns>
 /// The populated author, or null when <paramref name="authUrl"/> is null, when the
 /// scraper reports failure through its status flag, or when it returns no paper list.
 /// </returns>
 public SG.Author getAuthStatistics(string authUrl)
 {
     if (authUrl == null)
     {
         return null;
     }

     // The scraper reports construction problems through this ref flag.
     bool scraperReady = true;
     string profileUrl = authUrl + "&pagesize=100";
     GSAuthScraper scraper = new GSAuthScraper(profileUrl, 0, ref scraperReady);
     if (!scraperReady)
     {
         return null;
     }

     // Citation stats are loaded before the index getters below are read.
     scraper.getCitationStats();
     SG.Author profile = new SG.Author(
         scraper.getName(),
         scraper.getAffiliation(),
         scraper.getHomePage(),
         scraper.getHIndex(),
         scraper.getIIndex());

     List<SG.Paper> pagePapers = scraper.getPapersOfCurrentPage();
     if (pagePapers == null)
     {
         return null;
     }

     // An empty list simply leaves the author without papers.
     foreach (SG.Paper entry in pagePapers)
     {
         profile.addPaper(entry);
     }

     return profile;
 }
Ejemplo n.º 2
0
        // SEARCH PAGE RESULTS
        /// <summary>
        /// Runs a Google Scholar advanced search restricted to one author
        /// (<c>as_sauthors</c>, 20 results) and attaches every result on the returned
        /// page to a new <c>SG.Author</c> as an <c>SG.Paper</c>.
        /// </summary>
        /// <param name="authName">
        /// Author name to search for. Null is treated as empty; surrounding whitespace is
        /// removed and inner whitespace runs are replaced by '+'.
        /// </param>
        /// <param name="affiliation">Accepted for interface compatibility; not part of the query.</param>
        /// <param name="keywords">Accepted for interface compatibility; not part of the query.</param>
        /// <param name="next_url">
        /// Set to the absolute URL of the next results page, or to null when the page has
        /// no such link. Left untouched when the method returns before the results are
        /// parsed (load failure, captcha, or no results).
        /// </param>
        /// <returns>
        /// The author with the papers found (possibly none), or null when the page could
        /// not be loaded or a captcha was detected.
        /// </returns>
        public SG.Author getAuthors(string authName, string affiliation, string keywords, ref string next_url)
        {
            // The result keeps the name exactly as the caller supplied it.
            SG.Author result = new SG.Author(authName);

            // NOTE(review): the name is placed in the query string without URL-encoding,
            // so characters such as '&' or '#' would alter the request. Confirm whether
            // callers pre-encode before adding escaping here.
            authName = normalizeQueryTerm(authName);

            string url = "http://scholar.google.com/scholar?as_q=&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors="  + authName + "&as_publication=&as_ylo=&as_yhi=&btnG=&hl=en&as_sdt=0,5&num=20";

            Console.WriteLine(url);
            HtmlWeb web = new HtmlWeb();

            try
            {
                // 'doc' is declared outside this method; checkForCaptcha() is called
                // below after it has been loaded.
                doc = web.Load(url);
            }
            catch (Exception e)
            {
                Console.WriteLine("Failed to load results page: " + e.Message);
                return null;
            }

            HtmlNodeCollection searchResults = doc.DocumentNode.SelectNodes("//div[@class=\"gs_ri\"]");
            if (searchResults == null)
            {
                if (checkForCaptcha())
                {
                    Console.WriteLine("Captcha problem ...");
                    return null;
                }
                Console.WriteLine("No results ...");
                return result;
            }

            Console.WriteLine(url);
            int rank = 1;
            foreach (HtmlNode n in searchResults)
            {
                // TITLE AND TITLE LINK
                string title = "";
                string titleLink = "";
                HtmlNode child = n.SelectSingleNode(".//*[@class=\"gs_rt\"]");
                if (child != null) // a result without a title node must not abort the whole page
                {
                    title = child.InnerText;
                    HtmlNode url_node = child.SelectSingleNode(".//a");
                    if (url_node != null)
                    {
                        // Title links are used as given (no host prefix); only the
                        // "amp;" residue of HTML-escaped ampersands is stripped.
                        titleLink = url_node.GetAttributeValue("href", "").Replace("amp;", "");
                    }
                }

                // AUTHORS AND PUBLICATION
                string authors = "";
                string publication = "";
                string publisher = "";
                int year = 1970;
                child = n.SelectSingleNode(".//*[@class=\"gs_a\"]");
                if (child != null)
                {
                    parseBylineText(child.InnerText, out authors, out publication, out publisher, out year);
                }

                // SUMMARY
                string summary = "";
                child = n.SelectSingleNode(".//*[@class=\"gs_rs\"]");
                if (child != null)
                {
                    summary = child.InnerText;
                }

                // CITATION STUFF
                int no_of_citations = 0;
                string cited_by_url = "";
                child = n.SelectSingleNode(".//*[@class=\"gs_fl\"]");
                if (child != null) child = child.FirstChild;
                if (child != null)
                {
                    // The first child of the footer is expected to read "Cited by N";
                    // anything else leaves the count at 0 and the URL empty.
                    string text = (child.InnerText ?? "").Replace("Cited by", "").Trim();
                    int parsedCount;
                    if (int.TryParse(text, out parsedCount))
                    {
                        no_of_citations = parsedCount;
                    }

                    if (no_of_citations != 0)
                    {
                        cited_by_url = toAbsoluteScholarUrl(child.GetAttributeValue("href", ""));
                    }
                }

                SG.Paper paper = new SG.Paper(title, titleLink, authors, summary, year, publication, publisher, no_of_citations, cited_by_url, rank);
                result.addPaper(paper);
                rank++;
            }

            //NEXT PAGE URL
            HtmlNode bottom = doc.DocumentNode.SelectSingleNode(".//*[@id=\"gs_n\"]//table//td[@align=\"left\"]//a");
            string nextPage = bottom != null ? toAbsoluteScholarUrl(bottom.GetAttributeValue("href", "")) : "";
            next_url = nextPage.Equals("") ? null : nextPage;

            return result;
        }

        // Turns a free-text search term into its query form: null becomes "", the ends
        // are trimmed, and each whitespace run becomes a single '+'.
        private static string normalizeQueryTerm(string term)
        {
            if (term == null) return "";
            return Regex.Replace(term.Trim(), @"\s+", "+");
        }

        // Prefixes a site-relative href with the Scholar host and strips the "amp;"
        // residue of HTML-escaped ampersands; an empty href stays empty.
        private static string toAbsoluteScholarUrl(string href)
        {
            if (string.IsNullOrEmpty(href)) return "";
            return ("http://scholar.google.com" + href).Replace("amp;", "");
        }

        // Splits a result byline on '-' into authors, publication, publisher and year.
        // Segment 0 is the author list, segment 1 is either a bare year or
        // "publication, year", and segment 2 (when present) is the publisher.
        // Fields that cannot be determined keep their defaults ("" and 1970).
        private static void parseBylineText(string byline, out string authors, out string publication, out string publisher, out int year)
        {
            authors = "";
            publication = "";
            publisher = "";
            year = 1970;

            string[] names = (byline ?? "").Split('-');
            authors = names[0];
            if (names.Length == 1) return;
            if (names.Length > 2) publisher = names[2].Trim();

            string middle = names[1].Trim();
            int parsedYear;
            if (int.TryParse(middle, out parsedYear))
            {
                year = parsedYear;
                return;
            }

            // Not a bare year: treat it as "publication, year".
            string[] p = middle.Split(',');
            if (p.Length > 1 && int.TryParse(p[1], out parsedYear))
            {
                year = parsedYear;
            }
            publication = p[0].Trim();
        }