public string ScrapeURLToFile(string url, bool force_download = false, Dictionary <string, string> additional_headers = null) { string cache_key = StreamMD5.FromText(url); string directory = Path.GetFullPath(Path.Combine(base_directory, cache_key.Substring(0, 2))); string filename = Path.GetFullPath(Path.Combine(directory, cache_key)); if (!File.Exists(filename) || force_download) { Utilities.Files.DirectoryTools.CreateDirectory(directory); // Crude throttle if (true) { while (DateTime.UtcNow.Subtract(last_scrape_time).TotalMilliseconds < throttle_ms) { Thread.Sleep(50); } last_scrape_time = DateTime.UtcNow; } Logging.Info("Downloading from {0}", url); using (WebClient client = new WebClient()) { if (null != additional_headers) { foreach (var pair in additional_headers) { client.Headers.Add(pair.Key, pair.Value); } } if (!String.IsNullOrEmpty(userAgent)) { client.Headers.Add("User-agent", userAgent); } string temp_filename = filename + ".tmp"; try { client.DownloadFile(url, temp_filename); } catch (WebException ex) { File.WriteAllText(temp_filename, ex.ToString()); } File.Delete(filename); File.Move(temp_filename, filename); } string filename_manifest = Path.GetFullPath(Path.Combine(base_directory, @"manifest.txt")); string manifest_line = String.Format("{0}\t{1}", cache_key, url); using (StreamWriter sw = File.AppendText(filename_manifest)) { sw.WriteLine(manifest_line); } } return(filename); }
private string MkLegalSizedPath(string basename, string typeIdStr) { const int PATH_MAX = 240; // must be less than 255 / 260 - see also https://kb.acronis.com/content/39790 string root = Path.GetDirectoryName(basename); string name = Path.GetFileName(basename); string dataname = Path.GetFileNameWithoutExtension(DataFile); string ext = SubStr(Path.GetExtension(DataFile), 1).Trim(); // produce the extension without leading dot if (ext.StartsWith("bib")) { ext = SubStr(ext, 3).Trim(); } if (ext.Length > 0) { ext = "." + ext; } // UNC long filename/path support by forcing this to be a UNC path: string filenamebase = $"{dataname}.{name}{ext}{ExtensionWithDot}"; // first make the full path without the approved/received, so that that bit doesn't make a difference // in the length check and subsequent decision to produce a shorthand filename path or not: // It's not always needed, but do the different shorthand conversions anyway and pick the longest fitting one: string short_tn = SanitizeFilename(CamelCaseShorthand(name)); string short_dn = SanitizeFilename(SubStr(dataname, 0, 10) + CamelCaseShorthand(dataname)); string hash = StreamMD5.FromText(filenamebase).ToUpper(); string short_hash = SubStr(hash, 0, Math.Max(6, 11 - short_tn.Length)); // this variant will fit in the length criterium, guaranteed: string alt_filepath0 = Path.GetFullPath(Path.Combine(root, $"{short_dn}.{short_hash}_{short_tn}{ext}{typeIdStr}{ExtensionWithDot}")); string filepath = alt_filepath0; // next, we construct the longer variants to check if they fit. // // DO NOTE that we create a path without typeIdStr part first, because we want both received and approved files to be based // on the *same* alt selection decision! string picked_alt_filepath = Path.GetFullPath(Path.Combine(root, $"{short_dn}.{short_hash}_{short_tn}{ext}.APPROVEDXYZ{ExtensionWithDot}")); name = SanitizeFilename(name); dataname = SanitizeFilename(dataname); string alt_filepath1 = Path.GetFullPath(Path.Combine(root, $"{short_dn}_{short_hash}.{name}{ext}.APPROVEDXYZ{ExtensionWithDot}")); if (alt_filepath1.Length < PATH_MAX) { filepath = Path.GetFullPath(Path.Combine(root, $"{short_dn}_{short_hash}.{name}{ext}{typeIdStr}{ExtensionWithDot}")); picked_alt_filepath = alt_filepath1; } // second alternative: only pick this one if it fits and produces a longer name: string alt_filepath2 = Path.GetFullPath(Path.Combine(root, $"{dataname}.{short_hash}_{short_tn}{ext}.APPROVEDXYZ{ExtensionWithDot}")); if (alt_filepath2.Length < PATH_MAX && alt_filepath2.Length > picked_alt_filepath.Length) { filepath = Path.GetFullPath(Path.Combine(root, $"{dataname}.{short_hash}_{short_tn}{ext}{typeIdStr}{ExtensionWithDot}")); picked_alt_filepath = alt_filepath2; } else { // third alt: the 'optimally trimmed' test name used as part of the filename: int trim_length = PATH_MAX - alt_filepath0.Length + 10 - 1; string short_dn2 = SanitizeFilename(SubStr(dataname, 0, trim_length) + CamelCaseShorthand(dataname)); string alt_filepath3 = Path.GetFullPath(Path.Combine(root, $"{short_dn2}.{short_hash}_{short_tn}{ext}{typeIdStr}{ExtensionWithDot}")); if (alt_filepath3.Length < PATH_MAX && alt_filepath3.Length > picked_alt_filepath.Length) { filepath = Path.GetFullPath(Path.Combine(root, $"{short_dn2}.{short_hash}_{short_tn}{ext}{typeIdStr}{ExtensionWithDot}")); picked_alt_filepath = alt_filepath3; } } // fourth alt: the full, unadulterated path; if it fits in the length criterium, take it anyway string alt_filepath4 = Path.GetFullPath(Path.Combine(root, $"{dataname}.{name}{ext}.APPROVEDXYZ{ExtensionWithDot}")); if (alt_filepath4.Length < PATH_MAX) { // UNC long filename/path support by forcing this to be a UNC path: filepath = Path.GetFullPath(Path.Combine(root, $"{dataname}.{name}{ext}{typeIdStr}{ExtensionWithDot}")); picked_alt_filepath = alt_filepath4; } return(filepath); }