/// <summary>
/// Fills <c>queryParameters</c> the first time it is needed and records its
/// initial size in <c>nextQueryParameterToAdd</c>. Does nothing when the list
/// has already been loaded.
/// </summary>
private void LoadInitialQueryParameters()
{
    if (queryParameters != null)
    {
        return;
    }

    if (!parsedUnparsedOutOfSync)
    {
        // The parsed url is current: take the parameters from it.
        queryParameters = url.GetQueryParameters().ToList();
    }
    else
    {
        // Otherwise read them from the raw string, starting at the '?'.
        var questionMarkIndex = unparsedUrl.IndexOf('?');
        if (questionMarkIndex != -1)
        {
            queryParameters = HttpUtils.GetParameters(unparsedUrl.Substring(questionMarkIndex)).ToList();
        }
        else
        {
            queryParameters = new List<KeyValuePair<string, string>>();
        }
    }

    nextQueryParameterToAdd = queryParameters.Count;
}
/// <summary>
/// Looks up a cookie by exact name in <c>LastCookies</c>.
/// </summary>
/// <param name="name">Cookie name, compared with <c>==</c>.</param>
/// <returns>
/// The value of the first matching entry, or <c>null</c> when
/// <c>LastCookies</c> is <c>null</c> or no entry matches.
/// </returns>
public string GetCookie(string name)
{
    if (LastCookies == null)
    {
        return null;
    }

    // KeyValuePair is a struct, so a miss yields the default pair (null Value).
    var firstMatch = HttpUtils.GetParameters(LastCookies).FirstOrDefault(cookie => cookie.Key == name);
    return firstMatch.Value;
}
/// <summary>
/// Joins, with ';', the values of every entry in <c>LastCookies</c> whose
/// lower-cased name is contained in
/// <c>HttpUtils.Configuration_SessionCookieNames</c>.
/// </summary>
internal string GetSessionCookie()
{
    var sessionValues = HttpUtils.GetParameters(LastCookies)
        .Where(cookie => HttpUtils.Configuration_SessionCookieNames.Contains(cookie.Key.ToLowerFast()))
        .Select(cookie => cookie.Value)
#if NET35
        .ToArray()
#endif
        ;

    return string.Join(";", sessionValues);
}
/// <summary>
/// Two <c>WebFile</c> instances are equal when
/// <c>HttpUtils.UrisEqual</c> reports that their <c>Url</c>s are equal.
/// Any object that is not a <c>WebFile</c> (including <c>null</c>) is unequal.
/// </summary>
public override bool Equals(object obj)
{
    var other = obj as WebFile;
    return other == null
        ? false
        : HttpUtils.UrisEqual(other.Url, this.Url);
}
/// <summary>
/// Builds the url string when query or fragment parameters have been added
/// since the last call, i.e. when <c>nextQueryParameterToAdd</c> or
/// <c>nextFragmentParameterToAdd</c> lags behind the corresponding list count.
/// Both counters are advanced to the current counts as a side effect.
/// </summary>
/// <returns>The rebuilt url string, or <c>null</c> when nothing was pending.</returns>
internal string GetUrlStringIfNew()
{
    LoadInitialQueryParameters();
    LoadInitialFragmentParameters();

    NakedStringBuilder builder = null;
    string savedFragment = null;

    if (queryParameters != null && nextQueryParameterToAdd != queryParameters.Count)
    {
        if (unparsedUrl != null)
        {
            url = unparsedUrl.AsUri();
            unparsedUrl = null;
        }

        // New query parameters must go before any existing fragment, so the
        // fragment is set aside here and re-appended further down.
        savedFragment = this.url == null ? null : this.url.Fragment;
        var prefix = string.IsNullOrEmpty(savedFragment)
            ? url.AbsoluteUri
            : url.GetLeftPart_UriPartial_Query();

        builder = ReseekableStringBuilder.AcquirePooledStringBuilder();
        builder.Append(prefix);
        HttpUtils.AppendParameters(queryParameters.Skip(nextQueryParameterToAdd), builder, '?');
        nextQueryParameterToAdd = queryParameters.Count;
    }

    if (fragmentParameters != null && nextFragmentParameterToAdd != fragmentParameters.Count)
    {
        if (unparsedUrl != null)
        {
            url = unparsedUrl.AsUri();
            unparsedUrl = null;
        }

        if (!string.IsNullOrEmpty(savedFragment))
        {
            // savedFragment is only ever set in the query branch above, which
            // also creates the builder.
            builder.Append(savedFragment);
        }
        else if (builder == null)
        {
            var prefix = url.AbsoluteUri;
            builder = ReseekableStringBuilder.AcquirePooledStringBuilder();
            builder.Append(prefix);
        }

        HttpUtils.AppendParameters(fragmentParameters.Skip(nextFragmentParameterToAdd), builder, '#');
        nextFragmentParameterToAdd = fragmentParameters.Count;
    }
    else if (savedFragment != null && builder != null)
    {
        builder.Append(savedFragment);
    }

    if (builder == null)
    {
        return null;
    }

    return ReseekableStringBuilder.GetValueAndRelease(builder);
}
/// <summary>
/// Searches the "a" elements of <paramref name="scihub"/> (the node itself
/// and its descendants) for HTTP links whose path ends with ".pdf".
/// </summary>
/// <returns>
/// <c>null</c> when there is no such link; the link itself when there is
/// exactly one distinct url; otherwise the distinct url with the longest
/// <c>AbsoluteUri</c>.
/// </returns>
private static Uri FindPdfLink(HtmlNode scihub)
{
    var candidates = scihub.DescendantsAndSelf("a")
        .Select(anchor => anchor.TryGetLinkUrl())
        .Where(link => link != null && HttpUtils.IsHttp(link) && link.AbsolutePath.EndsWith(".pdf"))
        .GroupBy(link => link)
        .ToList();

    if (candidates.Count > 1)
    {
        return candidates.OrderByDescending(group => group.Key.AbsoluteUri.Length).First().Key;
    }

    return candidates.FirstOrDefault()?.Key;
}
/// <summary>
/// Attempts to locate a paper on LibGen, either by DOI or through an
/// already-known LibGen page url.
/// </summary>
/// <param name="doi">DOI used to build the LibGen url when <paramref name="libgenurl"/> is <c>null</c>.</param>
/// <param name="libgenurl">LibGen page to load; takes precedence over <paramref name="doi"/>.</param>
/// <param name="progress">Receives status messages.</param>
/// <returns>
/// The download file and the title captured from the page's citation
/// textarea, or <c>(null, null)</c> when both inputs are <c>null</c> or no
/// title could be captured.
/// </returns>
private static async Task<(WebFile, string)> TryGetLibgenAsync(string doi, Uri libgenurl, IProgress<DataTransferProgress> progress)
{
    if (doi == null && libgenurl == null)
    {
        return (null, null);
    }

    if (libgenurl == null)
    {
        libgenurl = HttpUtils.FormatEscaped("http://libgen.io/scimag/ads.php?doi={0}&downloadname=", doi);
    }

    progress.Report("Trying on LibGen.IO");
    var page = await libgenurl.GetHtmlNodeAsync();
    var citation = page.GetValue("textarea");
    var title = citation.TryCaptureBetween("title = {", "}");
    if (title == null)
    {
        return (null, null);
    }

    progress.Report("Found on LibGen.IO");
    var downloadUrl = page.GetLinkUrl("h2:text-is('GET'):select-parent");
    return (WebFile.FromUrlUntracked(downloadUrl), title);
}
/// <summary>
/// Copies metadata from a response (content length, content-disposition file
/// name, extension derived from the content type) onto this instance, then
/// either keeps the response for continued streaming or aborts it, and
/// finally calls <c>OnChanged</c>. Does nothing when
/// <paramref name="partialDownload"/> is <c>null</c>.
/// </summary>
/// <param name="partialDownload">Response whose headers are inspected.</param>
/// <param name="continueDownload">
/// When <c>true</c> and no live <c>manager</c> exists, the response is stored
/// and a new <c>MediaStreamManager</c> is created; otherwise the response is
/// aborted and disposed.
/// </param>
internal void SaveResponseInfo(HttpResponseMessage partialDownload, bool continueDownload)
{
    if (partialDownload == null)
    {
        return;
    }

#if NET35
    var contentLength = partialDownload.ContentLength == -1 ? null : (long?)partialDownload.ContentLength;
#else
    var contentLength = partialDownload.Content.Headers.ContentLength;
#endif
    if (contentLength != null)
    {
        Size = new FileSize(contentLength.Value);
    }

    // HACK: ignore content disposition in .net 35
#if !NET35
    var disposition = partialDownload.Content.Headers.ContentDisposition;
    if (disposition != null)
    {
        contentDispositionFileName = disposition.FileName;
    }
#endif

#if NET35
    var mime = HttpUtils.GetMimeFromContentType(partialDownload.Headers["Content-Type"]);
#else
    var mime = partialDownload.Content.Headers.ContentType?.MediaType;
#endif
    contentTypeExtension = HttpUtils.GetFileExtensionFromMime(mime);

    var takeOverResponse = continueDownload && (manager == null || !manager.IsAlive);
    if (takeOverResponse)
    {
        this.partialDownload = partialDownload;
        manager = new MediaStreamManager(GetResponseAsync, true);
    }
    else
    {
        partialDownload.AbortAndDispose();
    }

    OnChanged();
}
/// <summary>
/// Fills <c>fragmentParameters</c> the first time it is needed and records
/// its initial size in <c>nextFragmentParameterToAdd</c>. Does nothing when
/// the list has already been loaded.
/// </summary>
private void LoadInitialFragmentParameters()
{
    if (fragmentParameters != null)
    {
        return;
    }

    if (!parsedUnparsedOutOfSync)
    {
        // The parsed url is current: take the parameters from it.
        fragmentParameters = url.GetFragmentParameters().ToList();
    }
    else
    {
        // Otherwise read them from the raw string, starting at the '#'.
        var hashIndex = unparsedUrl.IndexOf('#');
        if (hashIndex != -1)
        {
            fragmentParameters = HttpUtils.GetParameters(unparsedUrl.Substring(hashIndex)).ToList();
        }
        else
        {
            fragmentParameters = new List<KeyValuePair<string, string>>();
        }
    }

    nextFragmentParameterToAdd = fragmentParameters.Count;
}
// Supported formats:
//   a=1&b=c                        (isUnprefixedExtraParameters)
//   §a=1&b=c
//   .link-next
//   .link-next§§preserve
//   .link-next                     (alwaysPreserveRemainingParameters)
//   .link-next§§preserve§§a={z}
//
// Applies a "next link" rule to modifiableUrl.
//
// Selector part (rule not starting with '§', not unprefixed): the node matched
// by the selector supplies the new url, taken from its link url or, failing
// that, from its value. With "preserve", parameters of the old url that the
// new url lacks are carried over ('$'-prefixed keys as fragment parameters,
// the rest as query parameters).
//
// Parameter part: each key=value is appended to the url. A leading '£' in a
// key is rewritten to '$'. "{delete}" removes the parameter. "{selector}"
// takes the value from node; "{optional:selector}" removes the parameter when
// the selector yields nothing, "{unchanged:selector}" leaves it untouched.
//
// modifiableUrl is set to null when the selector matches nothing, the link is
// not HTTP, or a mandatory "{selector}" value is missing.
//
// Returns true when the selector part replaced the url or a "{...}" value was
// evaluated; false otherwise.
public static bool UpdateNextLink(ref LazyUri modifiableUrl, HtmlNode node, string rule, bool isUnprefixedExtraParameters = false, bool alwaysPreserveRemainingParameters = false)
{
    var anyVarying = false;
    var keepOldParameters = alwaysPreserveRemainingParameters;

    if (!isUnprefixedExtraParameters)
    {
        string trailingParameterRule = null;

        if (!rule.StartsWith("§"))
        {
            if (rule.Contains("§§preserve"))
            {
                keepOldParameters = true;
                rule = rule.Replace("§§preserve", string.Empty);
            }

            if (rule.Contains("§§"))
            {
                trailingParameterRule = rule.CaptureAfter("§§");
                rule = rule.CaptureBefore("§§");
            }

            var linkNode = node.FindSingle(rule);
            if (linkNode == null)
            {
                modifiableUrl = null;
                return false;
            }

            var target = linkNode.TryGetLinkUrl() ?? linkNode.TryGetValue()?.AsUri();
            if (!HttpUtils.IsHttp(target))
            {
                modifiableUrl = null;
                return false;
            }

            if (!string.IsNullOrEmpty(target.Fragment))
            {
                // Drop the fragment of the discovered link.
                target = target.GetLeftPart_UriPartial_Query().AsUri();
            }

            // Snapshot the old parameters before the url is replaced.
            var carriedOver = keepOldParameters
                ? modifiableUrl.QueryParameters.Concat(modifiableUrl.FragmentParameters).ToList()
                : null;

            modifiableUrl = new LazyUri(target);

            if (carriedOver != null)
            {
                foreach (var previous in carriedOver)
                {
                    var previousKey = previous.Key;

                    if (previousKey.StartsWith("$json-query-") && modifiableUrl.GetQueryParameter(previousKey.CaptureBetween("-query-", "-")) != null)
                    {
                        continue;
                    }

                    if (modifiableUrl.GetQueryParameter(previousKey) != null || modifiableUrl.GetFragmentParameter(previousKey) != null)
                    {
                        continue;
                    }

                    if (previousKey.StartsWith("$"))
                    {
                        modifiableUrl.AppendFragmentParameter(previousKey, previous.Value);
                    }
                    else
                    {
                        modifiableUrl.AppendQueryParameter(previousKey, previous.Value);
                    }
                }
            }

            anyVarying = true;
            if (trailingParameterRule == null)
            {
                return anyVarying;
            }
        }

        // Either the text after "§§", or the rule without its leading '§'.
        rule = trailingParameterRule ?? rule.Substring(1);
    }

    foreach (var parameter in HttpUtils.GetParameters(rule))
    {
        var key = parameter.Key;
        var val = parameter.Value;

        if (key.StartsWith("£"))
        {
            key = "$" + key.Substring(1);
        }

        var isFragmentParameter = key.StartsWith("$");

        if (val == "{delete}")
        {
            if (isFragmentParameter)
            {
                modifiableUrl.RemoveFragmentParameter(key);
            }
            else
            {
                modifiableUrl.RemoveQueryParameter(key);
            }

            continue;
        }

        if (val.StartsWith("{") && val.EndsWith("}"))
        {
            var selector = val.Substring(1, val.Length - 2);
            var optional = false;
            var leaveUnchanged = false;

            if (selector.StartsWith("optional:"))
            {
                optional = true;
                selector = selector.CaptureAfter(":");
            }

            if (selector.StartsWith("unchanged:"))
            {
                leaveUnchanged = true;
                selector = selector.CaptureAfter(":");
            }

            var extracted = node.TryGetValue(selector);
            anyVarying = true;

            if (extracted == null)
            {
                if (leaveUnchanged)
                {
                    continue;
                }

                if (optional)
                {
                    if (isFragmentParameter)
                    {
                        modifiableUrl.RemoveFragmentParameter(key);
                    }
                    else
                    {
                        modifiableUrl.RemoveQueryParameter(key);
                    }

                    continue;
                }

                modifiableUrl = null;
                return anyVarying;
            }

            val = extracted;
        }

        if (isFragmentParameter)
        {
            modifiableUrl.AppendFragmentParameter(key, val);
        }
        else
        {
            modifiableUrl.AppendQueryParameter(key, val);
        }
    }

    return anyVarying;
}
/// <summary>
/// Returns the web-cache path associated with <c>CacheVaryKey</c>, or
/// <c>null</c> when no key is set. The path is obtained from
/// <c>Caching.GetWebCachePath</c> for a synthetic "shaman-cookies" url that
/// carries the escaped key as its <c>id</c> parameter.
/// </summary>
internal string GetCachePath()
{
    if (CacheVaryKey == null)
    {
        return null;
    }

    var syntheticUrl = new LazyUri("http://shaman-cookies/?id=" + HttpUtils.EscapeDataString(CacheVaryKey));
    return Caching.GetWebCachePath(syntheticUrl, false, true);
}
/// <summary>
/// Sends an HTTP request for <paramref name="url"/> and follows redirects
/// manually until a response without a <c>Location</c> header is received.
/// </summary>
/// <param name="url">Url to request. Its <c>$allow-same-redirect</c> fragment parameter controls redirect-loop handling.</param>
/// <param name="options">Request options; must not be <c>WebRequestOptions.DefaultOptions</c>.</param>
/// <param name="messageBox">Optional holder for a prebuilt request/response; receives the request message that was sent.</param>
/// <param name="alwaysCatchAndForbidRedirects">
/// When <c>true</c>, a redirect or any exception is reported through the
/// returned <c>HttpResponseInfo.Exception</c> instead of being followed/thrown.
/// </param>
/// <param name="keepResponseAliveOnError">When <c>false</c>, a response with status &gt;= 400 is disposed before the exception is thrown.</param>
/// <param name="synchronous">When <c>true</c>, the local-file access check and all delays are skipped.</param>
/// <returns>The final response together with the url that produced it.</returns>
/// <exception cref="ArgumentException"><paramref name="options"/> is <c>WebRequestOptions.DefaultOptions</c>.</exception>
/// <exception cref="WebException">
/// Status code &gt;= 400, a redirect while redirects are disallowed, too many
/// redirects, or a redirect loop.
/// </exception>
internal static async Task<HttpResponseInfo> SendAsync(this LazyUri url, WebRequestOptions options, HttpRequestMessageBox messageBox, bool alwaysCatchAndForbidRedirects = false, bool keepResponseAliveOnError = false, bool synchronous = false)
{
    HttpUtils.EnsureInitialized();
    if (!synchronous)
    {
        await Utils.CheckLocalFileAccessAsync(url);
    }

    Utils.RaiseWebRequestEvent(url, false);
    HttpResponseMessage result = null;
    LazyUri previousResponse2 = null;
    try
    {
        if (options == WebRequestOptions.DefaultOptions)
        {
            throw new ArgumentException();
        }

        if (options.WaitBefore.Ticks != 0 && !synchronous)
        {
            await TaskEx.Delay(options.WaitBefore);
        }

        // previousResponse2 is the url currently being requested;
        // previousResponse1 is the one requested before it (for loop detection).
        LazyUri previousResponse1 = null;
        previousResponse2 = url.Clone();
        previousResponse2 = MaybeAddAdditionalQueryParameters(previousResponse2, options);
        var redirectIndex = 0;
        while (true)
        {
#if WEBCLIENT
            HttpContent requestContent = null;
#endif
            var message = messageBox?.PrebuiltRequest ?? CreateRequestInternal(previousResponse2, options, true, redirectIndex
#if WEBCLIENT
                , out requestContent
#endif
                );
            if (messageBox != null)
            {
                messageBox.Dispose();
                messageBox.Message = message;
            }

#if WEBCLIENT
            if (requestContent != null)
            {
                if (requestContent.ContentType != null)
                {
                    message.ContentType = requestContent.ContentType;
                }

                if (requestContent.ContentDisposition != null)
                {
                    message.Headers["Content-Disposition"] = requestContent.ContentDisposition;
                }

                using (var req = await message.GetRequestStreamAsync())
                {
                    await requestContent.CopyToAsync(req);
                }
            }

            result = (HttpWebResponse)await message.GetResponseAsync();
#else
            message.Properties["ShamanURL"] = url;
            if (options.CustomHttpClient != null)
            {
                result = await options.CustomHttpClient.SendAsync(message, HttpCompletionOption.ResponseHeadersRead);
            }
            else
            {
                if (defaultHttpClient == null)
                {
                    defaultHttpClient = CreateHttpClient();
                }

                result = messageBox?.PrebuiltResponse ?? await defaultHttpClient.SendAsync(message, HttpCompletionOption.ResponseHeadersRead);
            }
#endif

#if !WEBCLIENT
            // Normalize the "utf8" charset label to "utf-8".
            if (result.Content != null && result.Content.Headers.ContentType != null && result.Content.Headers.ContentType.CharSet == "utf8")
            {
                result.Content.Headers.ContentType.CharSet = "utf-8";
            }
#endif

            if ((int)result.StatusCode >= 400)
            {
                if (!keepResponseAliveOnError)
                {
                    result.Dispose();
                }

                // Hackish, change purpose of enumeration type
                throw new WebException("The web server returned: " + result.StatusCode.ToString(), (WebExceptionStatus)result.StatusCode);
            }

#if WEBCLIENT
            var zz = result.Headers["Location"];
            var redirectUrlNative = zz != null ? HttpUtils.GetAbsoluteUri(url.PathConsistentUrl, zz) : null;
#else
            var redirectUrlNative = result.Headers.Location;
#endif
            if (redirectUrlNative == null)
            {
                return new HttpResponseInfo() { RespondingUrl = previousResponse2, Response = result };
            }
            else
            {
                if (alwaysCatchAndForbidRedirects)
                {
                    return new HttpResponseInfo()
                    {
                        Response = result,
                        RespondingUrl = previousResponse2,
                        Exception = new WebException("Unexpected redirect", HttpUtils.Error_UnexpectedRedirect)
                    };
                }

                result.Dispose();
                var redirectUrl = new LazyUri(redirectUrlNative);
                if (!redirectUrl.IsAbsoluteUri)
                {
                    redirectUrl = new LazyUri(new Uri(previousResponse2.PathConsistentUrl, redirectUrlNative));
                }

                if (options != null && !options.AllowRedirects)
                {
                    throw new WebException("Unexpected redirect was received.", HttpUtils.Error_UnexpectedRedirect);
                }

                if (redirectIndex == Configuration_MaximumNumberOfRedirects)
                {
                    throw new WebException("The maximum number of redirects has been reached.", HttpUtils.Error_MaximumNumberOfRedirectsExceeded);
                }

                // The loop check is skipped for the first redirect that follows a POST.
                if (!(redirectIndex == 0 && options != null && (options.PostData != null || options.PostString != null)))
                {
                    if ((previousResponse1 != null && HttpUtils.UrisEqual(redirectUrl.PathAndQueryConsistentUrl, previousResponse1.PathAndQueryConsistentUrl))
                        || HttpUtils.UrisEqual(redirectUrl, previousResponse2))
                    {
                        if (url.GetFragmentParameter("$allow-same-redirect") == "1")
                        {
                            if (!synchronous)
                            {
#if NET35
                                await TaskEx.Delay(Configuration_SameRedirectDelayTimeMs);
#else
                                await Task.Delay(Configuration_SameRedirectDelayTimeMs);
#endif
                            }
                        }
                        else
                        {
                            throw new WebException("The server isn't redirecting the requested resource properly.", HttpUtils.Error_RedirectLoopDetected);
                        }
                    }
                }

                previousResponse1 = previousResponse2;
                previousResponse2 = redirectUrl;
                redirectIndex++;
            }
        }
    }
    catch (Exception ex)
    {
        var orig = ex;
#if !WEBCLIENT
        // Unwrap HttpRequestException so callers see the underlying cause.
        var hre = ex as HttpRequestException;
        if (hre != null && hre.InnerException != null)
        {
            ex = hre.InnerException;
        }
#endif
        if (alwaysCatchAndForbidRedirects)
        {
            return new HttpResponseInfo() { Exception = ex, Response = result, RespondingUrl = previousResponse2 };
        }
        else if (ex == orig)
        {
            throw;
        }
        else
        {
            throw ex.Rethrow();
        }
    }
}
/// <summary>
/// Copies metadata from a response (content length, last-modified time,
/// content-disposition file name, extension derived from the content type)
/// onto this instance, optionally keeps or aborts the response, and finally
/// calls <c>OnChanged</c>. Does nothing when
/// <paramref name="partialDownload"/> is <c>null</c>.
/// </summary>
/// <param name="partialDownload">Response whose headers are inspected.</param>
/// <param name="continueDownload">
/// <c>null</c>: the response is neither stored nor disposed here.
/// <c>true</c> with no live <c>manager</c>: the response is stored and a new
/// <c>MediaStreamManager</c> is created. Otherwise the response is aborted
/// and disposed.
/// </param>
internal void SaveResponseInfo(HttpResponseMessage partialDownload, bool? continueDownload)
{
    if (partialDownload == null)
    {
        return;
    }

#if NET35
    var contentLength = partialDownload.ContentLength == -1 ? null : (long?)partialDownload.ContentLength;
#else
    var contentLength = partialDownload.Content.Headers.ContentLength;
#endif
    if (contentLength != null)
    {
        Size = new FileSize(contentLength.Value);
    }

#if !WEBCLIENT
    var lastModified = partialDownload.Content.Headers.LastModified?.UtcDateTime;

    // A parsable Memento-Datetime header wins when there is no Last-Modified
    // value or when it is earlier than that value.
    if (partialDownload.Headers.TryGetValues("Memento-Datetime", out var mementoValues)
        && DateTimeOffset.TryParse(mementoValues.First(), CultureInfo.InvariantCulture, DateTimeStyles.None, out var memento)
        && (lastModified == null || memento.UtcDateTime < lastModified))
    {
        lastModified = memento.UtcDateTime;
    }

    if (lastModified != null)
    {
        LastModified = lastModified;
    }
#endif

    // HACK: ignore content disposition in .net 35
#if !NET35
    var disposition = partialDownload.Content.Headers.ContentDisposition;
    if (disposition != null)
    {
        contentDispositionFileName = disposition.FileName;
    }
#endif

#if NET35
    var mime = HttpUtils.GetMimeFromContentType(partialDownload.Headers["Content-Type"]);
#else
    var mime = partialDownload.Content.Headers.ContentType?.MediaType;
#endif
    contentTypeExtension = HttpUtils.GetFileExtensionFromMime(mime);

    if (continueDownload.HasValue)
    {
        var takeOverResponse = continueDownload.Value && (manager == null || !manager.IsAlive);
        if (takeOverResponse)
        {
            this.partialDownload = partialDownload;
            manager = new MediaStreamManager(GetResponseAsync, true);
        }
        else
        {
            partialDownload.AbortAndDispose();
        }
    }

    OnChanged();
}
/// <summary>
/// Downloads the PDF for a paper url into "/Awdee/SciHub", trying in order:
/// the local cache, LibGen (by DOI), Academia.edu, the plain page (to
/// discover a DOI or a direct PDF link), and finally a Sci-Hub style mirror
/// of the url.
/// </summary>
/// <param name="url">Url of the paper page.</param>
/// <param name="force">NOTE(review): not read anywhere in this method - confirm whether it was meant to bypass the cache.</param>
/// <param name="progress">Receives status messages.</param>
/// <returns>Path of the cached or newly downloaded PDF.</returns>
/// <exception cref="CaptchaException">A page containing <c>img#captcha</c> was returned.</exception>
/// <exception cref="Exception">No PDF link could be found.</exception>
private static async Task<string> DownloadAttemptAsync(Uri url, bool force, IProgress<DataTransferProgress> progress)
{
    var originalUrl = url;

    // Cache key: first 16 hex digits of the SHA-1 of the url.
    string hash;
    using (var sha1 = new System.Security.Cryptography.SHA1Cng())
    {
        hash = ToHex(sha1.ComputeHash(Encoding.UTF8.GetBytes(url.AbsoluteUri))).ToLowerInvariant().Substring(0, 16);
    }

    // The lookup happens inside the lock: the dictionary is written under this
    // same lock at the end of the method, and Dictionary does not support a
    // read that runs concurrently with a write.
    string cachedPath;
    lock (typeof(Paper))
    {
        if (cachedPapers == null)
        {
            cachedPapers = new Dictionary<string, string>();
            foreach (var x in Directory.EnumerateFiles("/Awdee/SciHub", "*.pdf"))
            {
                var name = Path.GetFileName(x);
                var p = name.IndexOf('-');
                cachedPapers[p == -1 ? x : name.Substring(0, p)] = x;
            }
        }

        cachedPath = cachedPapers.TryGetValue(hash);
    }

    if (cachedPath != null)
    {
        return cachedPath;
    }

    progress.Report("Initializing");

    WebFile pdfFile = null;
    string title = null;
    HtmlNode original = null;
    var cookies = new IsolatedCookieContainer();
    string doi = null;

    if (url.IsHostedOn("dx.doi.org"))
    {
        doi = HttpUtils.UnescapeDataString(url.AbsolutePath.Substring(1));
    }

    (pdfFile, title) = await TryGetLibgenAsync(doi, null, progress);

    if (pdfFile == null && url.IsHostedOn("academia.edu"))
    {
        cookies.AddRange(HttpUtils.ParseCookies(Configuration_AcademiaEduCookie.Trim()));
        progress.Report("Retrieving from Academia.edu");
        original = await url.GetHtmlNodeAsync(null, cookies);
        var u = new LazyUri(original.GetLinkUrl("a[data-download],.js-swp-download-button"));
        u.AppendCookies(cookies);
        u.AppendFragmentParameter("$header-Referer", url.AbsoluteUri);
        title = original.GetValue(":property('citation_title')");
        pdfFile = WebFile.FromUrlUntracked(u.Url);
    }

    if (pdfFile == null)
    {
        try
        {
            progress.Report("Retrieving plain page");
            original = await url.GetHtmlNodeAsync();
            doi = original.TryGetValue(":property('citation_doi'),meta[scheme='doi']:property('dc.Identifier')");
            if (doi == null && url.IsHostedOn("nih.gov"))
            {
                doi = original.TryGetValue("a[ref='aid_type=doi'],.doi > a");
                if (doi == null && url.AbsolutePath.StartsWith("/pubmed/"))
                {
                    progress.Report("Finding DOI on EuropePMC.org");
                    var alt = await HttpUtils.FormatEscaped("http://europepmc.org/abstract/med/{0}", url.GetPathComponent(1)).GetHtmlNodeAsync();
                    doi = alt.TryGetValue("meta[name='citation_doi']", "content");
                }
            }

            if (doi == null && url.IsHostedOn("sciencedirect.com"))
            {
                doi = original.TryGetValue("script:json-token('SDM.doi = ')");
            }

            if (doi != null)
            {
                (pdfFile, title) = await TryGetLibgenAsync(doi, null, progress);
            }

            if (pdfFile == null && url.IsHostedOn("researchgate.net"))
            {
                var u = FindPdfLink(original);
                if (u != null)
                {
                    pdfFile = WebFile.FromUrlUntracked(u);
                }
            }

            if (title == null)
            {
                title = original.TryGetValue(":property('citation_title')");
                if (title == null)
                {
                    title = original.TryGetValue("title")?.TrimEnd(" - PubMed - NCBI");
                }
            }
        }
        catch (NotSupportedResponseException ex) when (ex.ContentType == "application/pdf")
        {
            // The url itself served a PDF instead of an HTML page.
            pdfFile = WebFile.FromUrlUntracked(url);
        }
    }

    if (pdfFile == null)
    {
        if (url.IsHostedOn("nlm.nih.gov"))
        {
            var a = original.TryGetLinkUrl(".portlet a");
            if (a != null)
            {
                url = a;
            }
            else
            {
                var k = FindPdfLink(original);
                if (k != null)
                {
                    pdfFile = WebFile.FromUrlUntracked(k);
                }
            }
        }

        if (pdfFile == null)
        {
            if (!url.IsHostedOn("scielo.br"))
            {
                var u = new LazyUri("http://" + url.Host + ".sci-hub.cc" + url.AbsolutePath + url.Query + url.Fragment);
                progress.Report("Trying on SciHub");
                u.AppendFragmentParameter("$allow-same-redirect", "1");
                url = u.Url;
            }
            else
            {
                progress.Report("Trying on " + url.Host);
            }

            var scihub = await url.GetHtmlNodeAsync(null, cookies);
            if (scihub.FindSingle("img#captcha") != null)
            {
                throw new CaptchaException(scihub.OwnerDocument.PageUrl);
            }

            if (scihub.OwnerDocument.PageUrl.IsHostedOn("libgen.io"))
            {
                var u = scihub.GetLinkUrl("a[href*='/ads.php?']");
                progress.Report("Found on LibGen.IO");
                (pdfFile, title) = await TryGetLibgenAsync(null, u, progress);
            }
            else
            {
                var pdflink = scihub.TryGetLinkUrl("iframe#pdf") ?? FindPdfLink(scihub);
                if (pdflink != null)
                {
                    var u = new LazyUri(pdflink);
                    u.AppendCookies(cookies);
                    pdfFile = WebFile.FromUrlUntracked(u.Url);
                }
            }
        }
    }

    if (pdfFile != null)
    {
        var uu = new LazyUri(pdfFile.Url);
        uu.AppendFragmentParameter("$allow-same-redirect", "1");
        uu.AppendFragmentParameter("$forbid-html", "1");
        pdfFile = WebFile.FromUrlUntracked(uu.Url);

        if (title == null)
        {
            var z = pdfFile.SuggestedFileName;
            if (z != null)
            {
                title = Path.GetFileNameWithoutExtension(z);
            }
        }
        else
        {
            title = title.Trim().TrimEnd(".").RegexReplace(@"\s+", "-");
        }

        progress.Report("Downloading from " + pdfFile.Url.Host);
        string path;
        try
        {
            path = await pdfFile.DownloadAsync("/Awdee/SciHub", hash + "-" + title + ".pdf", WebFile.FileOverwriteMode.Skip, CancellationToken.None, progress);
        }
        catch (NotSupportedResponseException ex)
        {
            if (ex.Page != null && ex.Page.FindSingle("img#captcha") != null)
            {
                throw new CaptchaException(ex.Page.OwnerDocument.PageUrl);
            }

            throw;
        }

        var filename = Path.GetFileName(path);
        lock (typeof(Paper))
        {
            cachedPapers[hash] = path;
            File.AppendAllText("/Awdee/SciHubDownloads.csv", string.Join("\t", originalUrl, title, doi, filename, new FileInfo(path).Length) + "\r\n", Encoding.UTF8);
        }

        progress.Report("Done.");
        return path;
    }

    throw new Exception("Could not find any PDF links.");
}