Esempio n. 1
0
        /// <summary>
        /// Loads the details of this photo from its mobile Facebook page
        /// (<c>https://m.facebook.com/photo.php?fbid=</c> followed by <c>Id</c>) and stores
        /// the results in <c>LargestImage</c> and <c>Date</c>.
        /// </summary>
        /// <param name="client">
        /// Optional HTTP client. It is only consulted in non-SHAMAN builds: when it is not null the page
        /// is requested through it, otherwise <c>GetNodeAsync</c> is used.
        /// </param>
        public async Task LoadDetailsAsync(HttpClient client = null)
        {
            // NOTE(review): the opening brace and the #endif below have no matching #if in this excerpt;
            // presumably a conditionally compiled wrapper statement was lost - confirm against the full file.
            {
#endif
                var l = new LazyUri("https://m.facebook.com/photo.php?fbid=" + Id);
#if SHAMAN
                // SHAMAN builds pass the configured Facebook session values as "$cookie-" fragment parameters.
                l.AppendFragmentParameter("$cookie-c_user", Blog.Configuration_FacebookUserId.ToString());
                l.AppendFragmentParameter("$cookie-xs", Blog.Configuration_FacebookXs);
                var page = await l.GetHtmlNodeAsync();
#else
                HtmlNode page;
                if (client != null)
                {
                    // A caller-supplied client is forwarded through custom request options.
                    page = await l.Url.GetHtmlNodeAsync(new WebRequestOptions()
                    {
                        CustomHttpClient = client, AllowCachingEvenWithCustomRequestOptions = true
                    });
                }
                else
                {
                    page = await GetNodeAsync(l.Url);
                }
#endif
                // The full-resolution image is the target of the link whose text is 'View Full Size'.
                var url = page.GetLinkUrl("a:text-is('View Full Size')");
#if SHAMAN
                LargestImage = WebImage.FromUrlUntracked(url);
#else
                LargestImage = WebFile.FromUrl(url);
#endif

                // The date is parsed from the value selected by "abbr" on the page.
                Date = Conversions.TryParseDateTime(page.TryGetValue("abbr"), null, false, null);

                // Disabled alternative implementation that queried graph.facebook.com instead of scraping the page:
                /*
                 * var k = await ("https://graph.facebook.com/" + Id + "?fields=images,from,created_time,backdated_time&access_token=" + Utils.EscapeDataString(Blog.Configuration_FacebookUserAccessToken)).AsLazyUri().GetJsonAsync<JObject>();
                 *
                 * var img = ((JArray)k["images"]).MaxByOrDefault(x => ((JObject)x).Value<int>("height"));
                 * LargestImage = WebImage.FromUrl(img.Value<string>("source").AsUri());
                 * var backdated = img.Value<string>("backdated_time");
                 * var created = img.Value<string>("created_time");
                 *
                 * if (created != null) DateCreated = Conversions.ParseDateTime(created, null, null);
                 * if (backdated != null) DateBackdated = Conversions.ParseDateTime(backdated, null, null);
                 */
            }
        }
        // Supported formats:
        // a=1&b=c    (isUnprefixedExtraParameters)
        // §a=1&b=c
        // .link-next
        // .link-next§§preserve
        // .link-next (alwaysPreserveRemainingParameters)
        // .link-next§§preserve§§a={z}

        /// <summary>
        /// Rewrites <paramref name="modifiableUrl"/> into the URL of the next page according to
        /// <paramref name="rule"/>, evaluated against <paramref name="node"/>.
        /// </summary>
        /// <remarks>
        /// A rule is either a selector for the "next" link (optionally followed by <c>§§preserve</c>
        /// and/or <c>§§</c> plus parameter changes), or a list of parameter changes prefixed with
        /// <c>§</c> (unprefixed when <paramref name="isUnprefixedExtraParameters"/> is true).
        /// In a parameter change, the value <c>{delete}</c> removes the parameter and
        /// <c>{selector}</c> is replaced by the value that selector yields on <paramref name="node"/>;
        /// the prefixes <c>optional:</c> (remove the parameter when nothing is found) and
        /// <c>unchanged:</c> (keep the parameter as it is when nothing is found) may precede the selector.
        /// Keys starting with <c>$</c> are fragment parameters; a leading <c>£</c> is rewritten to <c>$</c>.
        /// All marker checks are ordinal so rule parsing does not depend on the current culture.
        /// </remarks>
        /// <param name="modifiableUrl">
        /// Current URL; replaced or modified in place. Set to null when no next URL can be produced.
        /// </param>
        /// <param name="node">Page node on which selectors are evaluated.</param>
        /// <param name="rule">The rule, in one of the supported formats listed above.</param>
        /// <param name="isUnprefixedExtraParameters">True when the rule is a bare parameter list.</param>
        /// <param name="alwaysPreserveRemainingParameters">
        /// True to carry over parameters of the previous URL even without <c>§§preserve</c>.
        /// </param>
        /// <returns>
        /// True when a link was followed or at least one <c>{...}</c> placeholder was evaluated
        /// (even if that evaluation failed and <paramref name="modifiableUrl"/> was set to null);
        /// false when the link selector matched nothing, the link was not an HTTP URL, or only
        /// constant parameters were applied.
        /// </returns>
        public static bool UpdateNextLink(ref LazyUri modifiableUrl, HtmlNode node, string rule, bool isUnprefixedExtraParameters = false, bool alwaysPreserveRemainingParameters = false)
        {
            var anyVarying = false;
            var preserve = alwaysPreserveRemainingParameters;

            if (!isUnprefixedExtraParameters)
            {
                string additionalChanges = null;
                if (!rule.StartsWith("§", System.StringComparison.Ordinal))
                {
                    // The rule starts with a link selector; peel off its optional suffixes first.
                    if (rule.Contains("§§preserve"))
                    {
                        preserve = true;
                        rule = rule.Replace("§§preserve", string.Empty);
                    }
                    if (rule.Contains("§§"))
                    {
                        additionalChanges = rule.CaptureAfter("§§");
                        rule = rule.CaptureBefore("§§");
                    }

                    var nextlink = node.FindSingle(rule);
                    if (nextlink == null)
                    {
                        modifiableUrl = null;
                        return false;
                    }

                    // Prefer the link target; fall back to the node's own value interpreted as a URL.
                    var url = nextlink.TryGetLinkUrl();
                    if (url == null)
                    {
                        url = nextlink.TryGetValue()?.AsUri();
                    }
                    if (!HttpUtils.IsHttp(url))
                    {
                        modifiableUrl = null;
                        return false;
                    }
                    if (!string.IsNullOrEmpty(url.Fragment))
                    {
                        // Drop the fragment of the discovered link.
                        url = url.GetLeftPart_UriPartial_Query().AsUri();
                    }

                    // Snapshot the old parameters before the URL is replaced.
                    var defaults = preserve ? modifiableUrl.QueryParameters.Concat(modifiableUrl.FragmentParameters).ToList() : null;
                    modifiableUrl = new LazyUri(url);
                    if (defaults != null)
                    {
                        foreach (var kv in defaults)
                        {
                            // A "$json-query-<name>-..." entry is skipped when the new URL already has query parameter <name>.
                            if (kv.Key.StartsWith("$json-query-", System.StringComparison.Ordinal) && modifiableUrl.GetQueryParameter(kv.Key.CaptureBetween("-query-", "-")) != null)
                            {
                                continue;
                            }

                            // Only carry over parameters the new URL does not define itself.
                            if (modifiableUrl.GetQueryParameter(kv.Key) == null && modifiableUrl.GetFragmentParameter(kv.Key) == null)
                            {
                                if (kv.Key.StartsWith("$", System.StringComparison.Ordinal))
                                {
                                    modifiableUrl.AppendFragmentParameter(kv.Key, kv.Value);
                                }
                                else
                                {
                                    modifiableUrl.AppendQueryParameter(kv.Key, kv.Value);
                                }
                            }
                        }
                    }

                    anyVarying = true;
                    if (additionalChanges == null)
                    {
                        return anyVarying;
                    }
                }

                // What remains is a parameter list: either the part after "§§" or the rule minus its "§" prefix.
                rule = additionalChanges != null ? additionalChanges : rule.Substring(1);
            }

            foreach (var kv in HttpUtils.GetParameters(rule))
            {
                var val = kv.Value;
                var key = kv.Key;
                if (key.StartsWith("£", System.StringComparison.Ordinal))
                {
                    key = "$" + key.Substring(1);
                }

                var isFragmentParameter = key.StartsWith("$", System.StringComparison.Ordinal);

                if (val == "{delete}")
                {
                    if (isFragmentParameter)
                    {
                        modifiableUrl.RemoveFragmentParameter(key);
                    }
                    else
                    {
                        modifiableUrl.RemoveQueryParameter(key);
                    }
                    continue;
                }

                if (val.StartsWith("{", System.StringComparison.Ordinal) && val.EndsWith("}", System.StringComparison.Ordinal))
                {
                    // "{...}" is a placeholder whose content is a selector evaluated on the page.
                    val = val.Substring(1, val.Length - 2);
                    var optional = false;
                    var leaveUnchanged = false;
                    if (val.StartsWith("optional:", System.StringComparison.Ordinal))
                    {
                        optional = true;
                        val = val.CaptureAfter(":");
                    }
                    if (val.StartsWith("unchanged:", System.StringComparison.Ordinal))
                    {
                        leaveUnchanged = true;
                        val = val.CaptureAfter(":");
                    }

                    var v = node.TryGetValue(val);
                    anyVarying = true;
                    if (v == null)
                    {
                        if (leaveUnchanged)
                        {
                            continue;
                        }
                        if (optional)
                        {
                            if (isFragmentParameter)
                            {
                                modifiableUrl.RemoveFragmentParameter(key);
                            }
                            else
                            {
                                modifiableUrl.RemoveQueryParameter(key);
                            }
                            continue;
                        }

                        // A required placeholder could not be resolved: there is no next URL.
                        modifiableUrl = null;
                        return anyVarying;
                    }
                    val = v;
                }

                if (isFragmentParameter)
                {
                    modifiableUrl.AppendFragmentParameter(key, val);
                }
                else
                {
                    modifiableUrl.AppendQueryParameter(key, val);
                }
            }

            return anyVarying;
        }
Esempio n. 3
0
        /// <summary>
        /// Tries to obtain the PDF of the paper at <paramref name="url"/> and returns the path of the
        /// local file.
        /// </summary>
        /// <remarks>
        /// Downloads are stored in <c>/Awdee/SciHub</c> and keyed by the first 16 hex characters of the
        /// SHA-1 of the absolute URL; a previously downloaded paper is returned without any network access.
        /// Sources are tried in this order: LibGen by DOI (for dx.doi.org URLs), Academia.edu,
        /// the plain page (DOI extraction, LibGen, ResearchGate PDF link), an nlm.nih.gov outbound link,
        /// and finally a Sci-Hub mirror (or the site itself for scielo.br).
        /// Every successful download is appended to <c>/Awdee/SciHubDownloads.csv</c>.
        /// NOTE(review): <paramref name="force"/> is never read in this method - confirm whether it was
        /// meant to bypass the cache.
        /// </remarks>
        /// <param name="url">Address of the paper's page or PDF.</param>
        /// <param name="force">Currently unused.</param>
        /// <param name="progress">Receives status messages and download progress.</param>
        /// <returns>Path of the downloaded (or previously cached) PDF.</returns>
        /// <exception cref="CaptchaException">A visited page contains an <c>img#captcha</c> element.</exception>
        /// <exception cref="Exception">No PDF link could be found.</exception>
        private static async Task <string> DownloadAttemptAsync(Uri url, bool force, IProgress <DataTransferProgress> progress)
        {
            var originalUrl = url;
            string hash;

            using (var sha1 = new System.Security.Cryptography.SHA1Cng())
            {
                // Invariant lower-casing keeps the cache key independent of the current culture.
                hash = ToHex(sha1.ComputeHash(Encoding.UTF8.GetBytes(url.AbsoluteUri))).ToLowerInvariant().Substring(0, 16);
            }

            string cachedPath;
            lock (typeof(Paper))
            {
                if (cachedPapers == null)
                {
                    // First use: index the existing downloads by the part of the file name before the first '-'.
                    cachedPapers = new Dictionary <string, string>();
                    foreach (var x in Directory.EnumerateFiles("/Awdee/SciHub", "*.pdf"))
                    {
                        var name = Path.GetFileName(x);
                        var p = name.IndexOf('-');
                        // Files without a '-' are keyed by their full path, which effectively never matches a hash lookup.
                        cachedPapers[p == -1 ? x : name.Substring(0, p)] = x;
                    }
                }

                // The lookup happens inside the lock: other calls add entries under this same lock,
                // and Dictionary does not support a read that runs concurrently with a write.
                cachedPath = cachedPapers.TryGetValue(hash);
            }

            if (cachedPath != null)
            {
                return cachedPath;
            }

            progress.Report("Initializing");
            //  return;
            WebFile pdfFile = null;
            string title = null;
            HtmlNode original = null;
            var cookies = new IsolatedCookieContainer();

            string doi = null;

            if (url.IsHostedOn("dx.doi.org"))
            {
                // For dx.doi.org links the DOI is the unescaped path without its leading '/'.
                doi = HttpUtils.UnescapeDataString(url.AbsolutePath.Substring(1));
            }

            (pdfFile, title) = await TryGetLibgenAsync(doi, null, progress);

            if (pdfFile == null && url.IsHostedOn("academia.edu"))
            {
                cookies.AddRange(HttpUtils.ParseCookies(Configuration_AcademiaEduCookie.Trim()));
                progress.Report("Retrieving from Academia.edu");
                original = await url.GetHtmlNodeAsync(null, cookies);

                var u = new LazyUri(original.GetLinkUrl("a[data-download],.js-swp-download-button"));
                u.AppendCookies(cookies);
                u.AppendFragmentParameter("$header-Referer", url.AbsoluteUri);
                title = original.GetValue(":property('citation_title')");
                pdfFile = WebFile.FromUrlUntracked(u.Url);
            }

            if (pdfFile == null)
            {
                try
                {
                    progress.Report("Retrieving plain page");

                    original = await url.GetHtmlNodeAsync();

                    doi = original.TryGetValue(":property('citation_doi'),meta[scheme='doi']:property('dc.Identifier')");

                    if (doi == null && url.IsHostedOn("nih.gov"))
                    {
                        doi = original.TryGetValue("a[ref='aid_type=doi'],.doi > a");
                        if (doi == null && url.AbsolutePath.StartsWith("/pubmed/"))
                        {
                            progress.Report("Finding DOI on EuropePMC.org");
                            var alt = await HttpUtils.FormatEscaped("http://europepmc.org/abstract/med/{0}", url.GetPathComponent(1)).GetHtmlNodeAsync();

                            doi = alt.TryGetValue("meta[name='citation_doi']", "content");
                        }
                    }

                    if (doi == null && url.IsHostedOn("sciencedirect.com"))
                    {
                        doi = original.TryGetValue("script:json-token('SDM.doi = ')");
                    }

                    if (doi != null)
                    {
                        (pdfFile, title) = await TryGetLibgenAsync(doi, null, progress);
                    }

                    if (pdfFile == null && url.IsHostedOn("researchgate.net"))
                    {
                        var u = FindPdfLink(original);
                        if (u != null)
                        {
                            pdfFile = WebFile.FromUrlUntracked(u);
                        }
                    }

                    if (title == null)
                    {
                        title = original.TryGetValue(":property('citation_title')");
                        if (title == null)
                        {
                            title = original.TryGetValue("title")?.TrimEnd(" - PubMed - NCBI");
                        }
                    }
                }
                catch (NotSupportedResponseException ex) when(ex.ContentType == "application/pdf")
                {
                    // The URL served a PDF instead of an HTML page: download it directly.
                    pdfFile = WebFile.FromUrlUntracked(url);
                }
            }

            if (pdfFile == null)
            {
                if (url.IsHostedOn("nlm.nih.gov"))
                {
                    var a = original.TryGetLinkUrl(".portlet a");
                    if (a != null)
                    {
                        // Continue the search from the linked page instead.
                        url = a;
                    }
                    else
                    {
                        var k = FindPdfLink(original);
                        if (k != null)
                        {
                            pdfFile = WebFile.FromUrlUntracked(k);
                        }
                    }
                }

                if (pdfFile == null)
                {
                    if (!url.IsHostedOn("scielo.br"))
                    {
                        // Rewrite the URL so the same path is requested from the "<host>.sci-hub.cc" mirror.
                        var u = new LazyUri("http://" + url.Host + ".sci-hub.cc" + url.AbsolutePath + url.Query + url.Fragment);
                        progress.Report("Trying on SciHub");
                        u.AppendFragmentParameter("$allow-same-redirect", "1");
                        url = u.Url;
                    }
                    else
                    {
                        progress.Report("Trying on " + url.Host);
                    }

                    var scihub = await url.GetHtmlNodeAsync(null, cookies);

                    if (scihub.FindSingle("img#captcha") != null)
                    {
                        throw new CaptchaException(scihub.OwnerDocument.PageUrl);
                    }

                    if (scihub.OwnerDocument.PageUrl.IsHostedOn("libgen.io"))
                    {
                        var u = scihub.GetLinkUrl("a[href*='/ads.php?']");
                        progress.Report("Found on LibGen.IO");
                        (pdfFile, title) = await TryGetLibgenAsync(null, u, progress);
                    }
                    else
                    {
                        var pdflink = scihub.TryGetLinkUrl("iframe#pdf") ??
                                      FindPdfLink(scihub);
                        if (pdflink != null)
                        {
                            var u = new LazyUri(pdflink);
                            u.AppendCookies(cookies);
                            pdfFile = WebFile.FromUrlUntracked(u.Url);
                        }
                    }
                }
            }

            if (pdfFile != null)
            {
                var uu = new LazyUri(pdfFile.Url);
                uu.AppendFragmentParameter("$allow-same-redirect", "1");
                uu.AppendFragmentParameter("$forbid-html", "1");
                pdfFile = WebFile.FromUrlUntracked(uu.Url);
                if (title == null)
                {
                    // No title was found on any page: fall back to the file name suggested for the download.
                    var z = pdfFile.SuggestedFileName;
                    if (z != null)
                    {
                        title = Path.GetFileNameWithoutExtension(z);
                    }
                }
                else
                {
                    // Turn the title into a file-name fragment: no trailing dot, whitespace runs become '-'.
                    title = title.Trim().TrimEnd(".").RegexReplace(@"\s+", "-");
                }

                progress.Report("Downloading from " + pdfFile.Url.Host);
                string path;
                try
                {
                    path = await pdfFile.DownloadAsync("/Awdee/SciHub", hash + "-" + title + ".pdf", WebFile.FileOverwriteMode.Skip, CancellationToken.None, progress);
                }
                catch (NotSupportedResponseException ex)
                {
                    // Report a captcha page explicitly; any other unsupported response is rethrown unchanged.
                    if (ex.Page != null && ex.Page.FindSingle("img#captcha") != null)
                    {
                        throw new CaptchaException(ex.Page.OwnerDocument.PageUrl);
                    }
                    throw;
                }

                var filename = Path.GetFileName(path);
                lock (typeof(Paper))
                {
                    cachedPapers[hash] = path;
                    File.AppendAllText("/Awdee/SciHubDownloads.csv", string.Join("\t", originalUrl, title, doi, filename, new FileInfo(path).Length) + "\r\n", Encoding.UTF8);
                }
                progress.Report("Done.");
                return path;
            }

            throw new Exception("Could not find any PDF links.");
        }