Example #1
0
        /// <summary>
        /// Queries Google Scholar's advanced search for papers whose publication matches
        /// <paramref name="journalName"/> and parses one page of results (the URL asks for 20).
        /// </summary>
        /// <param name="journalName">Journal name placed in the <c>as_publication</c> query field. Null is treated as an empty name; surrounding whitespace is removed.</param>
        /// <param name="ISSN">Kept for interface compatibility. NOTE(review): the query never included this value; wire it into <c>as_q</c> if that was the intent.</param>
        /// <param name="keywords">Kept for interface compatibility. NOTE(review): the query never included this value; wire it into <c>as_q</c> if that was the intent.</param>
        /// <param name="next_url">Once results have been parsed, set to the absolute URL of the next result page, or to null when the page has no such link. Left untouched on every early return.</param>
        /// <returns>
        /// The journal populated with the parsed papers; the journal without papers when the page
        /// contains no result nodes; null when the page could not be loaded or when
        /// <c>checkForCaptcha()</c> reports true.
        /// </returns>
        public SG.Journal getJournals(string journalName, string ISSN, string keywords, ref string next_url)
        {
            // Normalise the name up front; the old code discarded the Trim() result and
            // crashed on a null name further down.
            journalName = (journalName == null) ? "" : journalName.Trim();
            SG.Journal result = new SG.Journal(journalName);

            // Collapse whitespace runs and percent-encode, so characters such as '&' or '+'
            // inside a journal name cannot break the query string.
            string name = Uri.EscapeDataString(Regex.Replace(journalName, @"\s+", " "));
            string url = "http://scholar.google.com/scholar?as_q=&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=&as_publication=" + name + "&as_ylo=&as_yhi=&btnG=&hl=en&as_sdt=0,5&num=20";
            Console.WriteLine(url);

            HtmlWeb web = new HtmlWeb();
            try
            {
                doc = web.Load(url);
            }
            catch (Exception e)
            {
                // Callers treat null as "could not fetch"; report why instead of failing silently.
                Console.WriteLine("Failed to load " + url + ": " + e.Message);
                return null;
            }

            HtmlNodeCollection searchResults = doc.DocumentNode.SelectNodes("//div[@class=\"gs_ri\"]");
            if (searchResults == null)
            {
                if (checkForCaptcha())
                {
                    Console.WriteLine("Captcha problem ...");
                    return null;
                }
                Console.WriteLine("No results ...");
                return result;
            }

            int rank = 1;
            foreach (HtmlNode n in searchResults)
            {
                // TITLE AND TITLE LINK
                HtmlNode titleNode = n.SelectSingleNode(".//*[@class=\"gs_rt\"]");
                if (titleNode == null)
                {
                    // A result block without a title node cannot be turned into a paper; skip it
                    // rather than dereferencing null.
                    continue;
                }
                string title = titleNode.InnerText;
                string titleLink = "";
                HtmlNode linkNode = titleNode.SelectSingleNode(".//a");
                if (linkNode != null)
                {
                    // Decode only the "&amp;" entity; stripping every "amp;" would also mangle
                    // URLs that legitimately contain that substring.
                    titleLink = linkNode.GetAttributeValue("href", "").Replace("&amp;", "&");
                }

                // AUTHORS AND PUBLICATION
                // The byline is split on '-': [0] authors, [1] either a bare year or
                // "publication, year", [2] publisher (when present).
                string authors = "";
                string publication = "";
                string publisher = "";
                int year = -1; // -1 means no year could be parsed
                HtmlNode bylineNode = n.SelectSingleNode(".//*[@class=\"gs_a\"]");
                if (bylineNode != null)
                {
                    string[] names = bylineNode.InnerText.Split('-');
                    authors = names[0];
                    if (names.Length > 2)
                    {
                        publisher = names[2];
                    }
                    if (names.Length > 1)
                    {
                        string middle = names[1].Trim();
                        int parsedYear;
                        if (int.TryParse(middle, out parsedYear))
                        {
                            year = parsedYear;
                        }
                        else
                        {
                            string[] p = middle.Split(',');
                            publication = p[0];
                            // Parse into a temporary so a failed parse leaves year at -1.
                            if (p.Length > 1 && int.TryParse(p[1], out parsedYear))
                            {
                                year = parsedYear;
                            }
                        }
                    }
                }

                // SUMMARY
                HtmlNode summaryNode = n.SelectSingleNode(".//*[@class=\"gs_rs\"]");
                string summary = (summaryNode != null) ? summaryNode.InnerText : "";

                // CITATION STUFF
                // The first child of the footer is read as "Cited by N"; when it is anything else
                // the count stays 0 and no citation link is recorded.
                int no_of_citations = 0;
                string cited_by_url = "";
                HtmlNode citeNode = n.SelectSingleNode(".//*[@class=\"gs_fl\"]");
                if (citeNode != null)
                {
                    citeNode = citeNode.FirstChild;
                }
                if (citeNode != null)
                {
                    int parsedCount;
                    if (int.TryParse(citeNode.InnerText.Replace("Cited by", "").Trim(), out parsedCount))
                    {
                        no_of_citations = parsedCount;
                    }

                    if (no_of_citations != 0)
                    {
                        cited_by_url = citeNode.GetAttributeValue("href", "");
                        if (!cited_by_url.Equals(""))
                        {
                            cited_by_url = "http://scholar.google.com" + cited_by_url.Replace("&amp;", "&");
                        }
                    }
                }

                SG.Paper paper = new SG.Paper(title, titleLink, authors, summary, year, publication, publisher, no_of_citations, cited_by_url, rank);
                result.addPaper(paper);
                rank++;
            }

            // NEXT PAGE URL
            next_url = null;
            HtmlNode bottom = doc.DocumentNode.SelectSingleNode(".//*[@id=\"gs_n\"]//table//td[@align=\"left\"]//a");
            if (bottom != null)
            {
                string href = bottom.GetAttributeValue("href", "");
                if (!href.Equals(""))
                {
                    next_url = "http://scholar.google.com" + href.Replace("&amp;", "&");
                }
            }
            return result;
        }
Example #2
0
        /// <summary>
        /// Loads one Google Scholar result page from <paramref name="url"/>, parses its entries and
        /// appends them to <paramref name="papers"/>, never growing the list beyond the
        /// <c>GSMaxResults</c> value read from settings.
        /// </summary>
        /// <param name="url">Absolute URL of the page to load; null makes the method return false.</param>
        /// <param name="next_url">After a fully processed page, set to the absolute URL of the following page, or to null when the page has no such link. Left untouched whenever the method returns false or null.</param>
        /// <param name="papers">List that receives the parsed papers. A null list is replaced by a new empty list.</param>
        /// <returns>
        /// true when the whole page was parsed and appended; false when nothing more should be
        /// fetched (limit already reached or reached on this page, null url, or a page without
        /// result nodes); null when the page could not be loaded or <c>checkForCaptcha()</c>
        /// reports true.
        /// </returns>
        public bool? getCitationsNextPage(string url, ref string next_url, ref List<SG.Paper> papers)
        {
            setts = setRecords.ReadSettings();
            int maxResults = setts.GSMaxResults;
            if (papers == null)
            {
                papers = new List<SG.Paper>();
            }
            int num = papers.Count;
            if (num >= maxResults) return false;

            // CONNECTIONS
            if (url == null) return false;
            HtmlWeb web = new HtmlWeb();

            try
            {
                doc = web.Load(url);
            }
            catch (Exception e)
            {
                // Callers treat null as "could not fetch"; report why instead of failing silently.
                Console.WriteLine("Failed to load " + url + ": " + e.Message);
                return null;
            }

            HtmlNodeCollection searchResults = doc.DocumentNode.SelectNodes("//div[@class=\"gs_ri\"]");
            if (searchResults == null)
            {
                if (checkForCaptcha())
                {
                    Console.WriteLine("Captcha problem ...");
                    return null;
                }
                Console.WriteLine("No results ...");
                return false;
            }

            // Parsed papers are staged here and copied to the caller's list in one step.
            List<SG.Paper> results = new List<SG.Paper>();
            bool limitReached = false;
            int rank = 1;

            foreach (HtmlNode n in searchResults)
            {
                if (num >= maxResults)
                {
                    // Stop parsing, but keep what was collected so far: returning from inside
                    // the loop used to throw away every paper already parsed from this page.
                    limitReached = true;
                    break;
                }

                // TITLE AND TITLE LINK
                HtmlNode titleNode = n.SelectSingleNode(".//*[@class=\"gs_rt\"]");
                if (titleNode == null)
                {
                    // A result block without a title node cannot be turned into a paper; skip it
                    // rather than dereferencing null.
                    continue;
                }
                string title = titleNode.InnerText;
                string titleLink = "";
                HtmlNode linkNode = titleNode.SelectSingleNode(".//a");
                if (linkNode != null)
                {
                    // Decode only the "&amp;" entity; stripping every "amp;" would also mangle
                    // URLs that legitimately contain that substring.
                    titleLink = linkNode.GetAttributeValue("href", "").Replace("&amp;", "&");
                }

                // AUTHORS AND PUBLICATION
                // The byline is split on '-': [0] authors, [1] either a bare year or
                // "publication, year", [2] publisher (when present).
                string authors = "";
                string publication = "";
                string publisher = "";
                int year = 1970; // value reported when no year could be parsed
                HtmlNode bylineNode = n.SelectSingleNode(".//*[@class=\"gs_a\"]");
                if (bylineNode != null)
                {
                    string[] names = bylineNode.InnerText.Split('-');
                    authors = names[0];
                    if (names.Length > 2)
                    {
                        publisher = names[2];
                    }
                    if (names.Length > 1)
                    {
                        string middle = names[1].Trim();
                        int parsedYear;
                        if (int.TryParse(middle, out parsedYear))
                        {
                            year = parsedYear;
                        }
                        else
                        {
                            string[] p = middle.Split(',');
                            publication = p[0];
                            // Parse into a temporary so a failed parse keeps the default year.
                            if (p.Length > 1 && int.TryParse(p[1], out parsedYear))
                            {
                                year = parsedYear;
                            }
                        }
                    }
                }

                // SUMMARY
                HtmlNode summaryNode = n.SelectSingleNode(".//*[@class=\"gs_rs\"]");
                string summary = (summaryNode != null) ? summaryNode.InnerText : "";

                // CITATION STUFF
                // The first child of the footer is read as "Cited by N". Its href is only kept
                // when a non-zero count was parsed, matching getJournals; otherwise the first
                // footer link is some other link and must not be stored as the citation URL.
                int no_of_citations = 0;
                string cited_by_url = "";
                HtmlNode citeNode = n.SelectSingleNode(".//*[@class=\"gs_fl\"]");
                if (citeNode != null)
                {
                    citeNode = citeNode.FirstChild;
                }
                if (citeNode != null)
                {
                    int parsedCount;
                    if (int.TryParse(citeNode.InnerText.Replace("Cited by", "").Trim(), out parsedCount))
                    {
                        no_of_citations = parsedCount;
                    }

                    if (no_of_citations != 0)
                    {
                        cited_by_url = citeNode.GetAttributeValue("href", "");
                        if (!cited_by_url.Equals(""))
                        {
                            cited_by_url = "http://scholar.google.com" + cited_by_url.Replace("&amp;", "&");
                        }
                    }
                }

                SG.Paper paper = new SG.Paper(title, titleLink, authors, summary, year, publication, publisher, no_of_citations, cited_by_url, rank);
                results.Add(paper);
                num++;
                rank++;
            }

            papers.AddRange(results);

            if (limitReached)
            {
                // Same return value as before for a limit hit mid-page; next_url stays untouched.
                return false;
            }

            // NEXT PAGE URL
            next_url = null;
            HtmlNode bottom = doc.DocumentNode.SelectSingleNode(".//*[@id=\"gs_n\"]//table//td[@align=\"left\"]//a");
            if (bottom != null)
            {
                string href = bottom.GetAttributeValue("href", "");
                if (!href.Equals(""))
                {
                    next_url = "http://scholar.google.com" + href.Replace("&amp;", "&");
                }
            }
            return true;
        }