예제 #1
0
 public void AppendCookies(IsolatedCookieContainer cookies)
 {
     AppendCookies(cookies.Cookies);
 }
예제 #2
0
        private static async Task <string> DownloadAttemptAsync(Uri url, bool force, IProgress <DataTransferProgress> progress)
        {
            var    originalUrl = url;
            string hash;

            using (var sha1 = new System.Security.Cryptography.SHA1Cng())
            {
                hash = ToHex(sha1.ComputeHash(Encoding.UTF8.GetBytes(url.AbsoluteUri))).ToLower().Substring(0, 16);
            }

            lock (typeof(Paper))
            {
                if (cachedPapers == null)
                {
                    cachedPapers = new Dictionary <string, string>();
                    foreach (var x in Directory.EnumerateFiles("/Awdee/SciHub", "*.pdf"))
                    {
                        var name = Path.GetFileName(x);
                        var p    = name.IndexOf('-');
                        cachedPapers[p == -1 ? x : name.Substring(0, p)] = x;
                    }
                }
            }

            var r = cachedPapers.TryGetValue(hash);

            if (r != null)
            {
                return(r);
            }
            progress.Report("Initializing");
            //  return;
            WebFile  pdfFile  = null;
            string   title    = null;
            HtmlNode original = null;
            var      cookies  = new IsolatedCookieContainer();


            string doi = null;

            if (url.IsHostedOn("dx.doi.org"))
            {
                doi = HttpUtils.UnescapeDataString(url.AbsolutePath.Substring(1));
            }


            (pdfFile, title) = await TryGetLibgenAsync(doi, null, progress);

            if (pdfFile == null && url.IsHostedOn("academia.edu"))
            {
                cookies.AddRange(HttpUtils.ParseCookies(Configuration_AcademiaEduCookie.Trim()));
                progress.Report("Retrieving from Academia.edu");
                original = await url.GetHtmlNodeAsync(null, cookies);

                var u = new LazyUri(original.GetLinkUrl("a[data-download],.js-swp-download-button"));
                u.AppendCookies(cookies);
                u.AppendFragmentParameter("$header-Referer", url.AbsoluteUri);
                title   = original.GetValue(":property('citation_title')");
                pdfFile = WebFile.FromUrlUntracked(u.Url);
            }

            if (pdfFile == null)
            {
                try
                {
                    progress.Report("Retrieving plain page");

                    original = await url.GetHtmlNodeAsync();

                    doi = original.TryGetValue(":property('citation_doi'),meta[scheme='doi']:property('dc.Identifier')");

                    if (doi == null && url.IsHostedOn("nih.gov"))
                    {
                        doi = original.TryGetValue("a[ref='aid_type=doi'],.doi > a");
                        if (doi == null && url.AbsolutePath.StartsWith("/pubmed/"))
                        {
                            progress.Report("Finding DOI on EuropePMC.org");
                            var alt = await HttpUtils.FormatEscaped("http://europepmc.org/abstract/med/{0}", url.GetPathComponent(1)).GetHtmlNodeAsync();

                            doi = alt.TryGetValue("meta[name='citation_doi']", "content");
                        }
                    }

                    if (doi == null && url.IsHostedOn("sciencedirect.com"))
                    {
                        doi = original.TryGetValue("script:json-token('SDM.doi = ')");
                    }

                    if (doi != null)
                    {
                        (pdfFile, title) = await TryGetLibgenAsync(doi, null, progress);
                    }

                    if (pdfFile == null && url.IsHostedOn("researchgate.net"))
                    {
                        var u = FindPdfLink(original);
                        if (u != null)
                        {
                            pdfFile = WebFile.FromUrlUntracked(u);
                        }
                    }

                    if (title == null)
                    {
                        title = original.TryGetValue(":property('citation_title')");
                        if (title == null)
                        {
                            title = original.TryGetValue("title")?.TrimEnd(" - PubMed - NCBI");
                        }
                    }
                }
                catch (NotSupportedResponseException ex) when(ex.ContentType == "application/pdf")
                {
                    pdfFile = WebFile.FromUrlUntracked(url);
                }
            }
            if (pdfFile == null)
            {
                if (url.IsHostedOn("nlm.nih.gov"))
                {
                    var a = original.TryGetLinkUrl(".portlet a");
                    if (a != null)
                    {
                        url = a;
                    }
                    else
                    {
                        var k = FindPdfLink(original);
                        if (k != null)
                        {
                            pdfFile = WebFile.FromUrlUntracked(k);
                        }
                    }
                }


                if (pdfFile == null)
                {
                    if (!url.IsHostedOn("scielo.br"))
                    {
                        var u = new LazyUri("http://" + url.Host + ".sci-hub.cc" + url.AbsolutePath + url.Query + url.Fragment);
                        progress.Report("Trying on SciHub");
                        u.AppendFragmentParameter("$allow-same-redirect", "1");
                        url = u.Url;
                    }
                    else
                    {
                        progress.Report("Trying on " + url.Host);
                    }

                    var scihub = await url.GetHtmlNodeAsync(null, cookies);

                    if (scihub.FindSingle("img#captcha") != null)
                    {
                        throw new CaptchaException(scihub.OwnerDocument.PageUrl);
                    }

                    if (scihub.OwnerDocument.PageUrl.IsHostedOn("libgen.io"))
                    {
                        var u = scihub.GetLinkUrl("a[href*='/ads.php?']");
                        progress.Report("Found on LibGen.IO");
                        (pdfFile, title) = await TryGetLibgenAsync(null, u, progress);
                    }
                    else
                    {
                        var pdflink = scihub.TryGetLinkUrl("iframe#pdf") ??
                                      FindPdfLink(scihub);
                        if (pdflink != null)
                        {
                            var u = new LazyUri(pdflink);
                            u.AppendCookies(cookies);
                            pdfFile = WebFile.FromUrlUntracked(u.Url);
                        }
                    }
                }
            }



            if (pdfFile != null)
            {
                var uu = new LazyUri(pdfFile.Url);
                uu.AppendFragmentParameter("$allow-same-redirect", "1");
                uu.AppendFragmentParameter("$forbid-html", "1");
                pdfFile = WebFile.FromUrlUntracked(uu.Url);
                if (title == null)
                {
                    var z = pdfFile.SuggestedFileName;
                    if (z != null)
                    {
                        title = Path.GetFileNameWithoutExtension(z);
                    }
                }
                else
                {
                    title = title.Trim().TrimEnd(".").RegexReplace(@"\s+", "-");
                }

                progress.Report("Downloading from " + pdfFile.Url.Host);
                string path;
                try
                {
                    path = await pdfFile.DownloadAsync("/Awdee/SciHub", hash + "-" + title + ".pdf", WebFile.FileOverwriteMode.Skip, CancellationToken.None, progress);
                }
                catch (NotSupportedResponseException ex)
                {
                    if (ex.Page != null && ex.Page.FindSingle("img#captcha") != null)
                    {
                        throw new CaptchaException(ex.Page.OwnerDocument.PageUrl);
                    }
                    throw;
                }
                var filename = Path.GetFileName(path);
                lock (typeof(Paper))
                {
                    cachedPapers[hash] = path;
                    File.AppendAllText("/Awdee/SciHubDownloads.csv", string.Join("\t", originalUrl, title, doi, filename, new FileInfo(path).Length) + "\r\n", Encoding.UTF8);
                }
                progress.Report("Done.");
                return(path);
            }
            throw new Exception("Could not find any PDF links.");
        }