예제 #1
0
        public string ScrapeURLToFile(string url, bool force_download = false, Dictionary <string, string> additional_headers = null)
        {
            string cache_key = StreamMD5.FromText(url);
            string directory = Path.GetFullPath(Path.Combine(base_directory, cache_key.Substring(0, 2)));
            string filename  = Path.GetFullPath(Path.Combine(directory, cache_key));

            if (!File.Exists(filename) || force_download)
            {
                Utilities.Files.DirectoryTools.CreateDirectory(directory);

                // Crude throttle
                if (true)
                {
                    while (DateTime.UtcNow.Subtract(last_scrape_time).TotalMilliseconds < throttle_ms)
                    {
                        Thread.Sleep(50);
                    }
                    last_scrape_time = DateTime.UtcNow;
                }

                Logging.Info("Downloading from {0}", url);
                using (WebClient client = new WebClient())
                {
                    if (null != additional_headers)
                    {
                        foreach (var pair in additional_headers)
                        {
                            client.Headers.Add(pair.Key, pair.Value);
                        }
                    }
                    if (!String.IsNullOrEmpty(userAgent))
                    {
                        client.Headers.Add("User-agent", userAgent);
                    }

                    string temp_filename = filename + ".tmp";
                    try
                    {
                        client.DownloadFile(url, temp_filename);
                    }
                    catch (WebException ex)
                    {
                        File.WriteAllText(temp_filename, ex.ToString());
                    }

                    File.Delete(filename);
                    File.Move(temp_filename, filename);
                }

                string filename_manifest = Path.GetFullPath(Path.Combine(base_directory, @"manifest.txt"));
                string manifest_line     = String.Format("{0}\t{1}", cache_key, url);
                using (StreamWriter sw = File.AppendText(filename_manifest))
                {
                    sw.WriteLine(manifest_line);
                }
            }

            return(filename);
        }
        private string MkLegalSizedPath(string basename, string typeIdStr)
        {
            const int PATH_MAX = 240;  // must be less than 255 / 260 - see also https://kb.acronis.com/content/39790

            string root     = Path.GetDirectoryName(basename);
            string name     = Path.GetFileName(basename);
            string dataname = Path.GetFileNameWithoutExtension(DataFile);
            string ext      = SubStr(Path.GetExtension(DataFile), 1).Trim(); // produce the extension without leading dot

            if (ext.StartsWith("bib"))
            {
                ext = SubStr(ext, 3).Trim();
            }
            if (ext.Length > 0)
            {
                ext = "." + ext;
            }

            // UNC long filename/path support by forcing this to be a UNC path:
            string filenamebase = $"{dataname}.{name}{ext}{ExtensionWithDot}";

            // first make the full path without the approved/received, so that that bit doesn't make a difference
            // in the length check and subsequent decision to produce a shorthand filename path or not:

            // It's not always needed, but do the different shorthand conversions anyway and pick the longest fitting one:
            string short_tn = SanitizeFilename(CamelCaseShorthand(name));
            string short_dn = SanitizeFilename(SubStr(dataname, 0, 10) + CamelCaseShorthand(dataname));

            string hash       = StreamMD5.FromText(filenamebase).ToUpper();
            string short_hash = SubStr(hash, 0, Math.Max(6, 11 - short_tn.Length));

            // this variant will fit in the length criterium, guaranteed:
            string alt_filepath0 = Path.GetFullPath(Path.Combine(root, $"{short_dn}.{short_hash}_{short_tn}{ext}{typeIdStr}{ExtensionWithDot}"));
            string filepath      = alt_filepath0;

            // next, we construct the longer variants to check if they fit.
            //
            // DO NOTE that we create a path without typeIdStr part first, because we want both received and approved files to be based
            // on the *same* alt selection decision!

            string picked_alt_filepath = Path.GetFullPath(Path.Combine(root, $"{short_dn}.{short_hash}_{short_tn}{ext}.APPROVEDXYZ{ExtensionWithDot}"));

            name     = SanitizeFilename(name);
            dataname = SanitizeFilename(dataname);

            string alt_filepath1 = Path.GetFullPath(Path.Combine(root, $"{short_dn}_{short_hash}.{name}{ext}.APPROVEDXYZ{ExtensionWithDot}"));

            if (alt_filepath1.Length < PATH_MAX)
            {
                filepath            = Path.GetFullPath(Path.Combine(root, $"{short_dn}_{short_hash}.{name}{ext}{typeIdStr}{ExtensionWithDot}"));
                picked_alt_filepath = alt_filepath1;
            }

            // second alternative: only pick this one if it fits and produces a longer name:
            string alt_filepath2 = Path.GetFullPath(Path.Combine(root, $"{dataname}.{short_hash}_{short_tn}{ext}.APPROVEDXYZ{ExtensionWithDot}"));

            if (alt_filepath2.Length < PATH_MAX && alt_filepath2.Length > picked_alt_filepath.Length)
            {
                filepath            = Path.GetFullPath(Path.Combine(root, $"{dataname}.{short_hash}_{short_tn}{ext}{typeIdStr}{ExtensionWithDot}"));
                picked_alt_filepath = alt_filepath2;
            }
            else
            {
                // third alt: the 'optimally trimmed' test name used as part of the filename:
                int    trim_length   = PATH_MAX - alt_filepath0.Length + 10 - 1;
                string short_dn2     = SanitizeFilename(SubStr(dataname, 0, trim_length) + CamelCaseShorthand(dataname));
                string alt_filepath3 = Path.GetFullPath(Path.Combine(root, $"{short_dn2}.{short_hash}_{short_tn}{ext}{typeIdStr}{ExtensionWithDot}"));
                if (alt_filepath3.Length < PATH_MAX && alt_filepath3.Length > picked_alt_filepath.Length)
                {
                    filepath            = Path.GetFullPath(Path.Combine(root, $"{short_dn2}.{short_hash}_{short_tn}{ext}{typeIdStr}{ExtensionWithDot}"));
                    picked_alt_filepath = alt_filepath3;
                }
            }

            // fourth alt: the full, unadulterated path; if it fits in the length criterium, take it anyway
            string alt_filepath4 = Path.GetFullPath(Path.Combine(root, $"{dataname}.{name}{ext}.APPROVEDXYZ{ExtensionWithDot}"));

            if (alt_filepath4.Length < PATH_MAX)
            {
                // UNC long filename/path support by forcing this to be a UNC path:
                filepath            = Path.GetFullPath(Path.Combine(root, $"{dataname}.{name}{ext}{typeIdStr}{ExtensionWithDot}"));
                picked_alt_filepath = alt_filepath4;
            }

            return(filepath);
        }